Example #1
    def _search_iterator(self,
                         q,
                         start,
                         max,
                         count,
                         sort,
                         level='Package',
                         include=False):
        if max and start + max > 10_000:
            raise Exception(
                'Pagination beyond 10000 hits not allowed, use empty max parameter to retrieve full set'
            )

        index = self.index_map.get(level, self.index_name)

        #print(index, self.index_map, flush=True)
        s = Search(using=self.elastic, index=index)
        # Search methods return modified copies, so the result must be reassigned.
        s = s.extra(track_total_hits=True)
        s.update_from_dict(q)  # update_from_dict modifies the search in place
        s = s.source(include)

        m = max or count

        for hit in s[start:start + m] if start + m <= 10_000 else s.scan():
            yield hit.meta.id if not include else (hit.meta.id,
                                                   self._hit_to_desc(hit))

    def query_event_ids(self):
        es_query = []
        es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
        query = Q({'bool': {'must': es_query}})
        s = Search(using=self.Client, index="winlogbeat-*").query(query)
        s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])

        count = s.count()
        print("Count: %d" % (count))

        event_ids = {}
        i = 0

        try:
            for hit in s.scan():
                print('%d. %d' % (i, hit.winlog.event_id))
                if hit.winlog.event_id not in event_ids:
                    event_ids[hit.winlog.event_id] = 1
                    print("%s: %d" % (hit.winlog.provider_name, hit.winlog.event_id))
                else:
                    event_ids[hit.winlog.event_id] += 1  
                    
                i += 1
        except Exception:
            traceback.print_exc()

    def query_query_names(self, size=6000, descending=True):
        winlog_event_data_name = "winlog.event_data.QueryName"

        es_query = []
        es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
        query = Q({'bool': {'must': es_query}})

        s = Search(using=self.Client, index="winlogbeat-*").query(query)
        s = s.source(includes=[winlog_event_data_name])

        if descending:
            order = 'desc'
        else:
            order = 'asc'
            
        s.aggs.bucket('distinct_query_name', 'terms', field = winlog_event_data_name, size = size, order = {'_count': order})

        response = s.execute()
        sorted_distinct_query_name = response.aggregations.distinct_query_name

        max_len = 0
        for e in sorted_distinct_query_name:
            if len(e.key) > max_len:
                max_len = len(e.key)
                
        fmt_str = "{0:%d} Count: {1}" % (max_len)
        for e in sorted_distinct_query_name:
            print(fmt_str.format(e.key, e.doc_count))
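
A minimal, self-contained sketch of the pattern used above: reassigning the immutable Search after each source()/extra() call and attaching a terms aggregation ordered by count. The host, index and field names below are assumptions for illustration only.

# Minimal sketch; host, index and field names are illustrative.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

client = Elasticsearch("http://localhost:9200")

s = Search(using=client, index="winlogbeat-*")
s = s.query(Q("match", **{"winlog.provider_name": "Microsoft-Windows-DNS-Client"}))
# source() (like extra(), sort(), filter(), ...) returns a copy, so reassign it.
s = s.source(includes=["winlog.event_data.QueryName"])
# Aggregations are attached in place through s.aggs.
s.aggs.bucket("distinct_query_name", "terms",
              field="winlog.event_data.QueryName",
              size=100, order={"_count": "desc"})

response = s.execute()
for bucket in response.aggregations.distinct_query_name.buckets:
    print(bucket.key, bucket.doc_count)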
Example #4
    def _create_request(self,
                        catalog: CatalogName,
                        filters: FiltersJSON,
                        post_filter: bool = False,
                        source_filter: SourceFilters = None,
                        enable_aggregation: bool = True,
                        entity_type='files') -> Search:
        """
        This function will create an ElasticSearch request based on
        the filters and facet_config passed into the function
        :param filters: The 'filters' parameter.
        Assumed to be already translated into es_key terms
        :param post_filter: Flag for doing either post_filter or regular
        querying (i.e. faceting or not)
        :param List source_filter: A list of "foo.bar" field paths (see
               https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-source-filtering.html)
        :param enable_aggregation: Flag for enabling query aggregation (and
               effectively ignoring facet configuration)
        :param entity_type: the string referring to the entity type used to get
        the ElasticSearch index to search
        :return: Returns the Search object that can be used for executing
        the request
        """
        service_config = self.service_config(catalog)
        field_mapping = service_config.translation
        facet_config = {
            key: field_mapping[key]
            for key in service_config.facets
        }
        es_search = Search(using=self.es_client,
                           index=config.es_index_name(catalog=catalog,
                                                      entity_type=entity_type,
                                                      aggregate=True))
        filters = self._translate_filters(catalog, filters, field_mapping)

        es_query = self._create_query(catalog, filters)

        if post_filter:
            es_search = es_search.post_filter(es_query)
        else:
            es_search = es_search.query(es_query)

        if source_filter:
            es_search = es_search.source(includes=source_filter)
        elif entity_type not in ("files", "bundles"):
            es_search = es_search.source(excludes="bundles")

        if enable_aggregation:
            for agg, translation in facet_config.items():
                # FIXME: Aggregation filters may be redundant when post_filter is false
                #        https://github.com/DataBiosphere/azul/issues/3435
                es_search.aggs.bucket(
                    agg,
                    self._create_aggregate(catalog, filters, facet_config,
                                           agg))

        return es_search
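
The post_filter flag above is what makes faceting work: a regular query narrows both the hits and the aggregations, while a post_filter narrows only the hits. A hedged sketch of the difference, with made-up index and field names:

# Sketch of query vs. post_filter; index and field names are assumptions.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

es = Elasticsearch("http://localhost:9200")
selection = Q("term", **{"project.laboratory.keyword": "lab-x"})

# query(): hits *and* aggregations are restricted by the selection.
filtered = Search(using=es, index="files").query(selection)
filtered.aggs.bucket("by_lab", "terms", field="project.laboratory.keyword")

# post_filter(): hits are restricted, but aggregations still see all documents,
# so facet counts are not narrowed by the user's own selection.
faceted = Search(using=es, index="files").post_filter(selection)
faceted.aggs.bucket("by_lab", "terms", field="project.laboratory.keyword")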
Example #5
def load_genes_by_region(chrom, start, end, features):
    """Retrieve genes by region"""
    index = _get_index_from_chr(chrom)
    search_genes = Search().using(es).doc_type('genes').index(index).filter(
        "range", positions={"lte": end, "gte": start})
    if not features:
        # source() returns a copy; reassign so the exclusion actually applies
        search_genes = search_genes.source(excludes=['isoforms'])
    genes = [gene.to_dict() for gene in search_genes.scan() ]
    for gene in genes:
        gene['ko_associations'] = load_gene_ko_associations(gene['name'], return_only_significant=True)
    return genes
Example #6
def create_mlt_with_id(document_id, position, index):
    s = Search(using=client, index=index)
    # source() returns a copy; keep the reassignment or the filtering is lost
    s = s.source(includes=['*'], excludes=["body"])
    mlt_match = MoreLikeThis(fields=["body.content"],
                             like=[document_id],  # was the builtin `id`
                             min_term_freq=1,
                             min_doc_freq=1)
    nested_query = Nested(path='body', inner_hits={}, query=mlt_match)
    s = s.query(nested_query)
    return s
Example #7
def search_graphs1(request,
                   owner_email=None,
                   names=None,
                   nodes=None,
                   edges=None,
                   tags=None,
                   member_email=None,
                   is_public=None,
                   query=None,
                   limit=20,
                   offset=0,
                   order='desc',
                   sort='name'):
    sort_attr = getattr(db.Graph, sort if sort is not None else 'name')
    order_by = getattr(db, order if order is not None else 'desc')(sort_attr)
    is_public = int(is_public) if is_public is not None else None

    if member_email is not None:
        member_user = users.controllers.get_user(request, member_email)
        if member_user is not None:
            group_ids = [
                group.id
                for group in users.controllers.get_groups_by_member_id(
                    request, member_user.id)
            ]
        else:
            raise Exception("User with given member_email doesn't exist.")
    else:
        group_ids = None

    if edges is not None:
        edges = [tuple(edge.split(':')) for edge in edges]

    if 'query' in query:
        s = Search(using=settings.ELASTIC_CLIENT, index='graphs')
        s.update_from_dict(query)
        s = s.source(False)  # only the document ids are needed
        graph_ids = [int(hit.meta.id) for hit in s.scan()]
    else:
        graph_ids = None

    total, graphs_list = db.find_graphs(request.db_session,
                                        owner_email=owner_email,
                                        graph_ids=graph_ids,
                                        is_public=is_public,
                                        group_ids=group_ids,
                                        names=names,
                                        nodes=nodes,
                                        edges=edges,
                                        tags=tags,
                                        limit=limit,
                                        offset=offset,
                                        order_by=order_by)

    return total, graphs_list
Example #8
def load_genes_by_region(chrom, start, end, features):
    """Retrieve genes by region"""
    index = _get_index_from_chr(chrom)
    search_genes = Search().using(es).doc_type('genes').index(index).filter(
        "range", positions={
            "lte": end,
            "gte": start
        })
    if not features:
        # reassign; source() does not modify the search in place
        search_genes = search_genes.source(excludes=['isoforms'])
    return [gene.to_dict() for gene in search_genes.scan()]

    def query_distinct_event_ids(self):
        es_query = []
        es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
        query = Q({'bool': {'must': es_query}})
        s = Search(using=self.Client, index="winlogbeat-*").query(query)
        s = s.source(includes=['winlog.event_id', 'winlog.event_data.LogString'])
        s.aggs.bucket('distinct_event_ids', 'terms', field = 'winlog.event_id', size = 1000)
        response = s.execute()

        sorted_distinct_distinct_event_ids = sorted(response.aggregations.distinct_event_ids, key = lambda kv:(kv.doc_count, kv.key), reverse = True)
        for e in sorted_distinct_distinct_event_ids:
            print("{0:50} {1}".format(e.key, e.doc_count))
Example #10
    def _search(self, query):
        s = Search(using=self.Client, index="winlogbeat-*").query(query)

        if self.DTRange is not None:
            s = s.filter('range', **self.DTRange)

        s = s.source(includes=['winlog.*'])
        s = s.sort('-winlog.event_data.UtcTime')

        if self.Scan:
            return s.scan()
        else:
            return s.execute().hits
Example #11
    def construct_multi_field_search(  # pylint: disable=too-many-arguments
        search: Search,
        text: str,
        operator: str,
        fields: List[str],
        size: int = 5,
        includes: Optional[List[str]] = None,
        excludes: Optional[List[str]] = None,
    ) -> Search:
        """Builds a text search over multiple fields.

        Args:
            search: Initial search.
            text: Search text.
            operator: Condition applied to the tokens of the text. If the
                operator is `and`, every token must be found in the inverted
                index for a document to match. If the operator is `or`, at
                least one token must be found in the inverted index for a
                document to match.
                See:
                    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
            size: Size of the search. Defaults to 5.
            includes: Selective control of the _source field.
                Returns only the specified fields.
            excludes: Selective control of the _source field.
                Excludes the specified fields.
            fields: Names of the :class:~`elastinga.schemas.TwitterPosts`
                fields to search in.

        Returns:
            search: Search

        """

        if includes:
            search = search.source(includes=includes)

        if excludes:
            search = search.source(excludes=excludes)

        search = search.query(
            Q("multi_match", query=text, operator=operator, fields=fields))

        search = search.params(size=size)

        logger.info("Query", query=search.to_dict())

        return search
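
A hypothetical call to the helper above, assuming it is exposed as a static helper and that the index contains the fields shown; the connection, index and field names are assumptions.

# Hypothetical usage; connection, index and field names are assumptions.
from elasticsearch_dsl import Search, connections

connections.create_connection(hosts=["http://localhost:9200"])
base = Search(index="twitter_posts")

search = construct_multi_field_search(
    search=base,
    text="flu vaccine",
    operator="and",                      # every token must match
    fields=["text", "user.description"],
    size=10,
    includes=["text", "created_at"],     # trim _source to what is needed
)
response = search.execute()
for hit in response:
    print(hit.meta.score, hit.text)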
Example #12
def searchAnotiranaRec(term):
    s = Search(index=index)
    s = s.source(includes=['pk', 'osnovniOblik'])
    s.query = Bool(must=[Match(oblici=term)])

    response = s.execute()
    return response.hits
Example #13
File: rest.py Project: unsftn/rsj
def _search_odrednica(request):
    if not request.GET.get('q'):
        return bad_request('no search term')

    term = request.GET.get('q')
    hits = []
    s = Search(index=ODREDNICA_INDEX)
    s = s.source(includes=['pk', 'rec', 'vrsta', 'rbr_homo'])
    s.query = MultiMatch(
        type='bool_prefix',
        query=remove_punctuation(term),
        fields=['varijante'],
        # analyzer=SERBIAN_ANALYZER
    )
    try:
        response = s.execute()
        for hit in response.hits.hits:
            hits.append(hit['_source'])

        serializer = OdrednicaResponseSerializer(hits, many=True)
        data = serializer.data

        return Response(
            data,
            status=HTTP_200_OK,
            content_type=JSON
        )
    except ElasticsearchException as error:
        return server_error(error.args)
Example #14
File: rest.py Project: unsftn/rsj
def _search_korpus(request):
    if not request.data or request.data['term'] is None:
        return bad_request('no search term')

    term = request.data['term']
    hits = []
    s = Search(index=KORPUS_INDEX)
    s = s.source(includes=['pk', 'osnovniOblik'])
    s.query = Bool(
        must=[Match(oblici=term)]
    )
    try:
        response = s.execute()
        for hit in response.hits.hits:
            hits.append(hit['_source'])

        serializer = KorpusResponseSerializer(hits, many=True)
        data = serializer.data

        return Response(
            data,
            status=HTTP_200_OK,
            content_type=JSON
        )
    except ElasticsearchException as error:
        return server_error(error.args)
Example #15
    def get_indices(self, docTypes: List = ["default"]) -> str:
        """
        Returns the index names to search for the given doc types.

        :param docTypes:        List of Doctypes to search, if empty will search all docTypes
        :return:                A string representing indexes to search. (will use * to regroup multiple indices)
        """

        es = get_es_conn()

        indexNamesStr = ""
        if docTypes:
            s = Search(using=es,
                       index=self.typeIndex,
                       doc_type="directory_type").query("ids", values=docTypes)
            s = s.params(scroll=get_scan_scroll_duration(),
                         size=get_nb_documents_per_scan_scroll())

            indexNamesQuery = s.source(["indexName"])
            indexNamesArr = []
            for indexNamePart in indexNamesQuery.scan():
                indexNamesArr.append(indexNamePart["indexName"])
            indexNamesStr = ','.join(indexNamesArr)
        else:
            indexNamesStr = self.dataIndexPrefix + "*"

        return indexNamesStr
Example #16
def test_connections_to_bert_service(created):
    print(f'starting task at {created}')
    from bert_serving.client import BertClient
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from elasticsearch_dsl import Search
    _TEMP_INDEX = "temp_rubert_index"
    bc = BertClient(ip="bert_as_service", check_length=False)

    ind_doc_search = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
    ind_doc_search = ind_doc_search.source(['id', 'text'])
    ind_doc_scan = ind_doc_search.scan()

    if ES_CLIENT.indices.exists(_TEMP_INDEX):
        ES_CLIENT.indices.delete(index=_TEMP_INDEX, ignore=[400, 404])

    ES_CLIENT.indices.create(index=_TEMP_INDEX)

    elastic_results = []

    for ind, res in enumerate(ind_doc_scan):
        if ind % 1000 == 0:
            print(f"Current index is {ind}")
        if ind % 25 == 0 and not ind == 0:
            vecs = bc.encode(
                [i['text'] for i in elastic_results]
            ).tolist()
            for j, vector in enumerate(vecs):  # avoid shadowing the outer loop index
                elastic_results[j].update({'rubert_embedding': vector})
            persist_in_elastic(ES_CLIENT, elastic_results, _TEMP_INDEX)
            elastic_results = []

        cleaned_text = clean_text(res.text)
        if len(cleaned_text) > 20:
            elastic_results.append(
                {'id': res.id, 'text': cleaned_text})
Example #17
    def get_documents_with_q(self,
                             index,
                             query=Q(),
                             source=None,
                             add_index_name=False):
        """
        Get documents from elasticsearch index
        :param index: elasticsearch index
        :param query: es query
        :param source: list of _source fields to return
        :return: dataframe with es data
        """

        s = Search(using=self.es, index=index)
        if source:
            s = s.source(source)
        # Dotted field names can be referenced by replacing . with __
        results = s.query(query).scan()

        if add_index_name:
            all_dicts = []
            for hit in results:
                result_dict = hit.to_dict()
                result_dict['_index'] = hit.meta.index
                all_dicts.append(result_dict)

            fa = pd.DataFrame.from_dict(all_dicts)
        else:
            fa = pd.DataFrame([hit.to_dict() for hit in results])

        return fa
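
A hypothetical call to get_documents_with_q, assuming an instance named es_helper of the owning class; the index pattern and field names are illustrative.

# Hypothetical usage; es_helper, the index pattern and field names are assumptions.
from elasticsearch_dsl import Q

df = es_helper.get_documents_with_q(
    index="metrics-*",
    query=Q("range", **{"@timestamp": {"gte": "now-1d"}}),
    source=["@timestamp", "host", "value"],
    add_index_name=True,   # keep the originating index in an "_index" column
)
print(df.head())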
Example #18
    def _get_search_client(self, size=35):
        client = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
        s = Search(using=client)
        s = s.extra(size=size)
        s = s.filter('term', published=True)
        s = s.source(excludes=["html_text"])
        return s
Example #19
def report_all_customers(customer_file,
                         my_index='epl',
                         my_database='duplicate_user'):
    """Outputs all the user keys to file."""
    ## Code based on solution found at:
    ## https://stackoverflow.com/questions/17497075/efficient-way-to-retrieve-all-ids-in-elasticsearch
    es = Elasticsearch()
    # epl/duplicate_user
    s = Search(using=es, index=my_index, doc_type=my_database)
    s = s.source(
        [])  # only get ids, otherwise `fields` takes a list of field names
    ids = [h.meta.id for h in s.scan()]
    try:
        file = open(customer_file, 'w')
    except OSError:
        sys.stderr.write('** error, while attempting to open "{0}"!\n'.format(
            customer_file))
        sys.exit(1)
    count = 0
    for key in ids:
        count = count + 1
        # UKEY
        file.write(key + '\n')
    sys.stderr.write(
        "total user keys in index {0}, doc_type {1}: {2}\n".format(
            my_index, my_database, count))
Example #20
def get_warnings_by_package(package_name, package_warnings):
    '''
        Returns all the warnings for a specific package

        Arguments:
            package_name: the package in the database
            package_warnings: a dict keyed by warning_type we will populate in this function
        Returns:
            None, but populates the package_warnings dict
    '''
    client = Elasticsearch(host=HOST)
    s = Search(using=client)
    s = s.source(['package', 'type', 'severity', 'score'])
    #q = Q("match", type=warning)  & Q("match", severity=severity)
    s = s.query("match", package__keyword=package_name)
    s = s.exclude("match", tag="test_code")
    #print(s.to_dict())

    # process the query
    for hit in s.scan():
        #print(hit.type)
        #print(hit.severity)
        #print(hit.package)

        if hit.type not in package_warnings.keys():
            package_warnings[hit.type] = {}
        if hit.severity in package_warnings[hit.type]:
            package_warnings[hit.type][hit.severity] += 1
        else:
            # start at 1 so the first occurrence is counted
            package_warnings[hit.type][hit.severity] = 1
Example #21
def search_more_like_this(talk):
    """ Get more like this documents
    """
    client = Elasticsearch([{
        'host':
        settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port':
        settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    s = s.query(
        MoreLikeThis(like={
            "_index": "vtalks",
            "_type": "talk",
            "_id": talk.id
        },
                     fields=['title', 'description', 'tags']))

    # Sorting
    s = s.sort({"_score": {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
Example #22
def get_article_by_title(title, access_groups):
    s = Search(using=client, index=wiki_index_name)
    s = s.filter(access_filter(access_groups))
    s = s.query("term", title__raw=title)
    s = s.source(excludes=["access"])
    res = s.execute()
    return format_article(res)
Example #23
    def _search(self, index, table, fields=None):
        """
        Search private area for matching docs in Elasticsearch.

        only returns the _id of the matching document.

        fields = {
            'id': [1, 2],
            'uid': ['a002', 'a009']
        }
        """
        fields = fields or {}
        search = Search(using=self.__es, index=index)
        # explicitly exclude all fields since we only need the doc _id
        search = search.source(excludes=['*'])
        for key, values in fields.items():
            search = search.query(
                Bool(
                    filter=[
                        Q('terms', **{f'{META}.{table}.{key}': values}) |
                        Q('terms', **{f'{META}.{table}.{key}.keyword': values})
                    ]
                )
            )
        for hit in search.scan():
            yield hit.meta.id
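
Excluding every _source field, as above, is a lightweight way to stream nothing but document ids; a minimal sketch with an assumed connection and an illustrative index name:

# Minimal sketch; connection and index name are assumptions.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch("http://localhost:9200")
s = Search(using=es, index="my-index")
s = s.source(excludes=["*"])   # or s.source(False) to drop _source entirely
ids = [hit.meta.id for hit in s.scan()]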
Example #24
def search(client,
           index,
           query,
           start=None,
           end=None,
           source=None,
           sort=None,
           get_scan_obj=False,
           get_search_obj=False):
    from elasticsearch_dsl import Search
    s = Search(using=client, index=index)
    for key, value in query.items():
        if any(
                key.endswith(range_selector)
                for range_selector in ['__gte', '__lte', '__gt', '__lt']):
            range_selector = key.split("__")[-1]
            s = s.filter(
                'range', **{
                    key.replace(f"__{range_selector}", ""): {
                        range_selector: value
                    }
                })
        else:
            s = es_filter_term(s, key, value)
    if source:
        s = s.source(include=source)
    if sort:
        s = s.sort(*sort)
    s = s[start:end]
    if get_scan_obj:
        return s.scan()
    elif get_search_obj:
        return s
    else:
        return s.execute()
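
A hypothetical call to the search() helper above, showing the __gte/__lte suffix convention; the client, index and field names are assumptions, and es_filter_term must be available in the same module.

# Hypothetical usage of the search() helper; names below are illustrative.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
hits = search(
    es,
    index="events-*",
    query={
        "timestamp__gte": "now-7d",   # becomes a range filter on "timestamp"
        "status": "error",            # other keys go through es_filter_term
    },
    source=["timestamp", "status", "message"],
    sort=["-timestamp"],
    start=0,
    end=100,
)
for hit in hits:
    print(hit.timestamp, hit.message)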
Example #25
def search_talks(page=None, sort=None):
    """ Get Talks from by Topic from ElasticSearch
    """
    client = Elasticsearch([{
        'host':
        settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port':
        settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    # Pagination
    if page:
        start = 0
        end = 10
        if page > 1:
            start = settings.PAGE_SIZE * (page - 1)
            end = settings.PAGE_SIZE * page
        s = s[start:end]

    # Sorting (skip when no sort field was given)
    if sort:
        s = s.sort({sort: {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
Example #26
    def documents_by_text(self, grouped_targets: dict, queries: list,
                          from_index: int, size: int) -> tuple:
        """
        Paginated documents found by text.
        """
        # For pagination/score sorting to work, we need to query all the different corpus indices in the same
        # Elasticsearch query.
        # We are using the grouped-target approach, like search-documents-by-annotations, even though buckets
        # are inconsequential for text search.
        indices = self.target_text_document_indices(grouped_targets)
        indices_argument = ','.join(indices)

        language_manager = get_language_manager()
        match_queries = [
            to_match_query(language_manager, query) for query in queries
        ]
        grouped_queries = self.group_queries_by_operator(match_queries)

        # A query language restriction, if present, will work automatically via the query text.<language> mapping.
        es = get_es_conn()
        search = Search(using=es, index=indices_argument)
        search = search.source(["title", "language", "source"])

        search.query = Q('bool',
                         must=grouped_queries["must"],
                         must_not=grouped_queries["must_not"],
                         should=grouped_queries["should"])

        search = search[from_index:from_index + size]
        count = search.count()
        documents = [self.map_hit_with_score(hit) for hit in search]

        return count, documents
def elasticsearch_pages(context, sort, page):
    result_limit = int(os.environ['RESULT_LIMIT'])
    max_result_limit = int(os.environ['MAX_RESULT_LIMIT'])
    start = (page - 1) * result_limit
    end = start + result_limit
    domain_query = Q("term", is_banned=False)
    if context["is_up"]:
        domain_query = domain_query & Q("term", is_up=True)
    if not context["show_fh_default"]:
        domain_query = domain_query & Q("term", is_crap=False)
    if not context["show_subdomains"]:
        domain_query = domain_query & Q("term", is_subdomain=False)
    if context["rep"] == "genuine":
        domain_query = domain_query & Q("term", is_genuine=True)
    if context["rep"] == "fake":
        domain_query = domain_query & Q("term", is_fake=True)

    limit = max_result_limit if context["more"] else result_limit

    has_parent_query = Q("has_parent", type="domain", query=domain_query)
    query = Search().filter(has_parent_query).query(
        Q("match", body_stripped=context['search']))
    query = query.highlight_options(
        order='score', encoder='html').highlight('body_stripped')[start:end]
    query = query.source(['title', 'domain_id', 'created_at',
                          'visited_at']).params(request_cache=True)
    return query.execute()
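
The snippet above combines a has_parent filter with score-ordered highlighting; a minimal sketch of just the highlight plus _source selection, with illustrative names and a configured default connection assumed:

# Minimal sketch; index and field names are assumptions.
from elasticsearch_dsl import Search, Q

s = Search(index="pages").query(Q("match", body_stripped="hidden service"))
s = s.highlight_options(order="score", encoder="html")
s = s.highlight("body_stripped")
s = s.source(["title", "domain_id", "created_at"])

response = s.execute()
for hit in response:
    for fragment in hit.meta.highlight.body_stripped:
        print(fragment)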
Example #28
class ElasticSearchConnector:
    def __init__(self, host: str, index_name: str, *args, **kwargs):
        self.search = Search(using=Elasticsearch(hosts=host), index=index_name)

    def get_window(self,
                   turbines: tuple,
                   start_date: datetime.datetime,
                   end_date: datetime.datetime,
                   fields=('*', )):
        self.search = self.search.filter("range",
                                         timestamp={
                                             "gte": start_date,
                                             "lt": end_date
                                         })
        self.search = self.search.filter(
            "terms",
            wind_turbine=turbines,
        )

        fields = ('timestamp', ) + fields
        self.search = self.search.source(include=fields)
        self.search = self.search.sort('timestamp')

        hits = list(self.search.scan())
        return hits_to_dataframe(hits)

    def get_plain_data(self,
                       turbines: tuple,
                       start_date: datetime.datetime,
                       end_date: datetime.datetime,
                       fields=('*', )):
        self.search = self.search.filter("range",
                                         timestamp={
                                             "gte": start_date,
                                             "lt": end_date
                                         })
        self.search = self.search.filter(
            "terms",
            wind_turbine=turbines,
        )

        fields = ('timestamp', ) + fields
        self.search = self.search.source(include=fields)
        self.search = self.search.sort('timestamp')

        hits = list(self.search.scan())
        return hits_to_dataframe(hits)
Example #29
def queryES(index, host):
    host_addr = 'http://' + host + ':9200/'
    client = Elasticsearch([host_addr])
    s = Search(using=client, index=index)
    s = s.source(['hash', 'author_date', 'author'])

    response = s.scan()
    return response
Example #30
    def get_accounts(self, account_ids, size=1000):
        s = Search(using='objects', index="objects-account", extra={'size': size })
        s = s.filter('terms', id=account_ids)
        s = s.source([ 'id', 'name', 'options.voting_account'])
        s = s.params(clear_scroll=False) # Avoid calling DELETE on ReadOnly apis.

        accounts = [hit.to_dict() for hit in s.scan()]
        return accounts
Example #31
    def _search(self, index, mapping, query, real_fields):
        query_dict = json.loads(query)

        e = Elasticsearch(es_url)
        search = Search(index=index, doc_type=mapping).update_from_dict(query_dict).using(e)
        search = search.source(real_fields)  # Select fields to return.
        search = search[0:query_dict.get("size", 10)]  # Select how many documents to return.

        response = search.execute()
        for hit in response:
            yield hit.to_dict()
Example #32
def search_graphs1(request, owner_email=None, names=None, nodes=None, edges=None, tags=None, member_email=None,
                   is_public=None, query=None, limit=20, offset=0, order='desc', sort='name'):
	sort_attr = getattr(db.Graph, sort if sort is not None else 'name')
	order_by = getattr(db, order if order is not None else 'desc')(sort_attr)
	is_public = int(is_public) if is_public is not None else None

	if member_email is not None:
		member_user = users.controllers.get_user(request, member_email)
		if member_user is not None:
			group_ids = [group.id for group in users.controllers.get_groups_by_member_id(request, member_user.id)]
		else:
			raise Exception("User with given member_email doesn't exist.")
	else:
		group_ids = None

	if edges is not None:
		edges = [tuple(edge.split(':')) for edge in edges]

	if 'query' in query:
		s = Search(using=settings.ELASTIC_CLIENT, index='graphs')
		s.update_from_dict(query)
		s = s.source(False)  # only the document ids are needed
		graph_ids = [int(hit.meta.id) for hit in s.scan()]
	else:
		graph_ids = None

	total, graphs_list = db.find_graphs(request.db_session,
	                                    owner_email=owner_email,
	                                    graph_ids=graph_ids,
	                                    is_public=is_public,
	                                    group_ids=group_ids,
	                                    names=names,
	                                    nodes=nodes,
	                                    edges=edges,
	                                    tags=tags,
	                                    limit=limit,
	                                    offset=offset,
	                                    order_by=order_by)

	return total, graphs_list
Example #33
def get_all_cans(index, estype=Types.candidate,
                 fields=['id'],
                 status=1, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    s = s.filter('term', status=status)

    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [hit['id'] for hit in resp]
Example #34
def get_all_job_cans(index, estype=Types.job_candidate,
                     fields=['id', 'job', 'candidate'],
                     status=None, at_most=10000):
    s = Search(using=client)
    s = s.filter('term', _index=index)
    s = s.filter('term', _type=estype)
    if status:
        s = s.filter('term', status=status)

    s = s.source(include=fields)
    s = s[:at_most]
    resp = s.execute()
    # print(resp.took)
    # print(resp.hits.total)
    return [{'id': hit['id'],
             'job_id': hit['job'],
             'can_id': hit['candidate']} for hit in resp]
Example #35
def search_cans():
    work_years = 0
    salary_low = 1000
    salary_high = 50000
    status = 0
    job_3l_nums = [
        "13040342",
        "02250254",
        "02250166",
        "04550449",
        "00510085"
      ]

    s = Search(using=client, index='can_tenant_chouun')

    # s = s.filter('term', _index='can_tenant_chouun')
    s = s.filter('term', _type='candidate')

    q = Q('nested', path='analysis',
          query=Q('term', **{'analysis.job_3l_num': job_3l_nums[0]}))
    for job_3l_num in job_3l_nums[1:]:
        q |= Q('nested', path='analysis',
               query=Q('term', **{'analysis.job_3l_num': job_3l_num}))
    # s = s.filter('term', status=status)
    s = s.query(q)
    s = s.query(Q('nested', path='analysis',
                  query=Q('range', **{'analysis.salary': {'gte': int(salary_low) - 3000}})))
    s = s.filter('range', years={
        'lte': datetime.date.today().year - int(work_years)
    })

    s = s.source(include=['id', 'analysis'])
    s = s[0:200]

    resp = s.execute()
    print(resp['took'])
    print(resp['hits']['total'])
Example #36
    def _build_query(self):
        query = Q()

        source = ['id']
        sort = []

        aggregations = {}
        query_string = None
        as_list = as_dict = False

        for action, value in self.steps:
            if action == 'order_by':
                for key in value:
                    if key.startswith('-'):
                        sort.append({key[1:]: 'desc'})
                    else:
                        sort.append(key)
            elif action == 'values':
                source.extend(value)
                as_list, as_dict = True, False
            elif action == 'values_dict':
                if value:
                    source.extend(value)
                as_list, as_dict = False, True
            elif action == 'query':
                query &= self._process_queries(value)
            elif action == 'filter':
                query &= self._process_filters(value)
            elif action == 'source':
                source.extend(value)
            elif action == 'aggregate':
                aggregations.update(value)
            elif action == 'filter_query_string':
                query_string = value
            else:
                raise NotImplementedError(action)

        # If we have a raw query string we are going to apply all sorts
        # of boosts and filters to improve relevance scoring.
        #
        # We are using the same rules that `search.filters:SearchQueryFilter`
        # implements to have a single-source of truth for how our
        # scoring works.
        from olympia.search.filters import SearchQueryFilter

        search = Search().query(query)

        if query_string:
            search = SearchQueryFilter().apply_search_query(
                query_string, search)

        if sort:
            search = search.sort(*sort)

        if source:
            search = search.source(source)

        body = search.to_dict()

        # These are manually added for now to simplify a partial port to
        # elasticsearch-dsl
        if self.start:
            body['from'] = self.start
        if self.stop is not None:
            body['size'] = self.stop - self.start
        if aggregations:
            body['aggs'] = aggregations

        self.source, self.as_list, self.as_dict = source, as_list, as_dict
        return body
Example #37
class EsRdfBulkLoader(object):
    """ Bulk loads data from the triplestore to elasticsearch """

    log_level = logging.DEBUG

    def __init__(self, rdf_class, tstore_conn, search_conn, **kwargs):
        log.setLevel(self.log_level)
        self.tstore_conn = tstore_conn
        self.search_conn = search_conn

        try:
            self.es_index = rdf_class.es_defs.get('kds_esIndex')[0]
            self.es_doc_type = rdf_class.es_defs.get('kds_esDocType')[0]
        except TypeError:
            log.warn("'%s' is NOT configured for indexing to elasticsearch",
                     rdf_class)
            return
        self.search = Search(using=search_conn.es).index(self.es_index)
        self.rdf_class = rdf_class
        self._set_es_workers(**kwargs)
        self.idx_start_time = XsdDatetime(datetime.datetime.utcnow())
        # add all of the subclasses for the rdf_class
        self.rdf_types = [rdf_class.uri] + [item.uri
                                            for item in rdf_class.subclasses]
        # self.query = self.items_query_template.format(
        #         rdf_types="\n\t\t".join(rdf_types),
        #         idx_start_time=XsdDatetime(datetime.datetime.utcnow()).sparql)
        EsMappings().initialize_indices()
        if kwargs.get("reset_idx"):
            self.delete_idx_status(self.rdf_class)
        self.count = 0
        kwargs['uri_list'] = self.get_uri_list()
        # self._index_group_with_subgroup(**kwargs)
        while len(kwargs['uri_list']) > 0:
            self._index_group_with_subgroup(**kwargs)
            kwargs['uri_list'] = self.get_uri_list()

    def _set_es_workers(self, **kwargs):
        """
        Creates index worker instances for each class to index

        kwargs:
        -------
            idx_only_base[bool]: True will only index the base class
        """
        def make_es_worker(search_conn, es_index, es_doc_type, class_name):
            """
            Returns a new es_worker instance

            args:
            -----
                search_conn: the connection to elasticsearch
                es_index: the name of the elasticsearch index
                es_doc_type: the name of the elasticsearch doctype
                class_name: name of the rdf class that is being indexed
            """
            new_esbase = copy.copy(search_conn)
            new_esbase.es_index = es_index
            new_esbase.doc_type = es_doc_type
            log.info("Indexing '%s' into ES index '%s' doctype '%s'",
                     class_name.pyuri,
                     es_index,
                     es_doc_type)
            return new_esbase

        def additional_indexers(rdf_class):
            """
            returns additional classes to index based off of the es definitions
            """
            rtn_list = rdf_class.es_indexers()
            rtn_list.remove(rdf_class)
            return rtn_list


        self.es_worker = make_es_worker(self.search_conn,
                                        self.es_index,
                                        self.es_doc_type,
                                        self.rdf_class.__name__)
        if not kwargs.get("idx_only_base"):
            self.other_indexers = {item.__name__: make_es_worker(
                        self.search_conn,
                        item.es_defs.get('kds_esIndex')[0],
                        item.es_defs.get('kds_esDocType')[0],
                        item.__name__)
                    for item in additional_indexers(self.rdf_class)}
        else:
            self.other_indexers = {}

    def _index_sub(self, uri_list, num, batch_num):
        """
        Converts a list of uris to elasticsearch json objects

        args:
            uri_list: list of uris to convert
            num: the ending count within the batch
            batch_num: the batch number
        """
        bname = '%s-%s' % (batch_num, num)
        log.debug("batch_num '%s' starting es_json conversion",
                  bname)
        qry_data = get_all_item_data([item[0] for item in uri_list],
                                     self.tstore_conn,
                                     rdfclass=self.rdf_class)
        log.debug("batch_num '%s-%s' query_complete | count: %s",
                  batch_num,
                  num,
                  len(qry_data))
        # path = os.path.join(CFG.dirs.cache, "index_pre")
        # if not os.path.exists(path):
        #     os.makedirs(path)
        # with open(os.path.join(path, bname + ".json"), "w") as fo:
        #     fo.write(json.dumps(qry_data))
        data = RdfDataset(qry_data)
        del qry_data
        log.debug("batch_num '%s-%s' RdfDataset Loaded", batch_num, num)
        for value in uri_list:
            try:

                self.batch_data[batch_num]['main'].append(\
                        data[value[0]].es_json())
                self.count += 1
            except KeyError:
                pass
        for name, indexer in self.other_indexers.items():
            for item in data.json_qry("$.:%s" % name.pyuri):
                val = item.es_json()
                if val:
                    self.batch_data[batch_num][name].append(val)
                    self.batch_uris[batch_num].append(item.subject)
        del data
        del uri_list
        log.debug("batch_num '%s-%s' converted to es_json", batch_num, num)

    def get_uri_list(self, **kwargs):
        """
        Returns a list of Uris to index
        """
        index_status_filter = """
                optional {{ ?s dcterm:modified ?modTime }} .
                optional {{ ?s kds:esIndexTime ?time }} .
                optional {{ ?s kds:esIndexError ?error }}
                filter (
                    !(bound(?time)) ||
                    ?time<?modTime  ||
                    (bound(?error) && ?time < {idx_start_time}))
                """.format(idx_start_time=self.idx_start_time.sparql)
        items_query_template = """
            SELECT DISTINCT ?s ?es_id
            {{
                VALUES ?rdftypes {{\n\t\t{rdf_types} }} .
                ?s a ?rdftypes .
                BIND(SHA1(STR(?s)) as ?es_id) .
                {status_filter}
            }}
            {order_by}
            """
        status_filter = index_status_filter \
                        if not kwargs.get("no_status") else ""
        order_by = kwargs.get("order_by", "")
        sparql = items_query_template.format(
                rdf_types="\n\t\t".join(self.rdf_types),
                status_filter=status_filter,
                order_by=order_by)
        results = [(Uri(item['s']['value']), item['es_id']['value'],)
                   for item in self.tstore_conn.query(sparql=sparql)]
        return results #[:100]

    def _index_group_with_subgroup(self, **kwargs):
        """ indexes all the URIs defined by the query into Elasticsearch """

        log.setLevel(self.log_level)
        # get a list of all the uri to index
        uri_list = kwargs.get('uri_list', self.get_uri_list())
        if not uri_list:
            log.info("0 items to index")
            return
        # results = results[:100]
        # Start processing through uri
        batch_file = os.path.join(CFG.dirs.logs, "batch_list.txt")
        # with open(batch_file, "w") as fo:
        #     fo.write("{")
        log.info("'%s' items to index", len(uri_list))
        self.time_start = datetime.datetime.now()
        batch_size = kwargs.get("batch_size", 12000)
        if len(uri_list) > batch_size:
            batch_end = batch_size
        else:
            batch_end = len(uri_list)
        batch_start = 0
        batch_num = 1
        self.batch_data = {}
        self.batch_data[batch_num] = {}
        self.batch_data[batch_num]['main'] = []
        self.batch_uris = {}
        self.batch_uris[batch_num] = []
        for name, indexer in self.other_indexers.items():
            self.batch_data[batch_num][name] = []
        end = False
        last = False
        final_list = []
        expand_index = kwargs.get("expand_index", True)
        while not end:
            log.debug("batch %s: %s-%s", batch_num, batch_start, batch_end)
            sub_batch = []
            j = 0
            for i in range(batch_start, batch_end):
            # for i, subj in enumerate(uri_list[batch_start:batch_end]):
                qry_size = kwargs.get("qry_size", 1000)
                if j < qry_size:
                    try:
                        sub_batch.append(uri_list.pop()) #subj)
                    except IndexError:
                        pass
                if j == qry_size -1 or i == batch_end - 1:
                    try:
                        sub_batch.append(uri_list.pop()) #subj)
                    except IndexError:
                        pass
                    # with open(batch_file, "a") as fo:
                    #     fo.write(json.dumps({str('%s-%s' % (batch_num, i+1)):
                    #                          [item[0].sparql
                    #                           for item in sub_batch]})[1:-1]+",\n")
                    if not kwargs.get("no_threading", False):
                        th = threading.Thread(name=batch_start + i + 1,
                                              target=self._index_sub,
                                              args=(sub_batch,
                                                    i+1,
                                                    batch_num,))
                        th.start()
                    else:
                        self._index_sub(sub_batch, i+1, batch_num)
                    j = 0
                    final_list += sub_batch
                    sub_batch = []
                else:
                    j += 1
            log.debug(datetime.datetime.now() - self.time_start)
            if not kwargs.get("no_threading", False):
                main_thread = threading.main_thread()
                for t in threading.enumerate():
                    if t is main_thread:
                        continue
                    t.join()
            action_list = []
            for key, items in self.batch_data[batch_num].items():
                if key == 'main':
                    es_worker = self.es_worker
                else:
                    es_worker = self.other_indexers[key]
                action_list += es_worker.make_action_list(items)
            result = self.es_worker.bulk_save(action_list)
            final_list += self.batch_uris[batch_num]
            self._update_triplestore(result, action_list)
            del action_list
            del self.batch_uris[batch_num]
            del self.batch_data[batch_num]
            try:
                del pyrdf.memorized
                pyrdf.memorized = {}
            except AttributeError:
                pass
            while gc.collect() > 0:
                pass
            # pdb.set_trace()
            batch_end += batch_size
            batch_start += batch_size
            if last:
                end = True
            if len(uri_list) <= batch_size:
                batch_end = len(uri_list)
                last = True
            batch_num += 1
            self.batch_uris[batch_num] = []
            self.batch_data[batch_num] = {}
            self.batch_data[batch_num]['main'] = []
            for name, indexer in self.other_indexers.items():
                self.batch_data[batch_num][name] = []
            log.debug(datetime.datetime.now() - self.time_start)
        # with open(batch_file, 'rb+') as fo:
        #     fo.seek(-2, os.SEEK_END)
        #     fo.truncate()
        #     # fo.close()
        #     fo.write("}".encode())

    def _update_triplestore(self, es_result, action_list, **kwargs):
        """
        updates the triplestore with successful saves and failures of indexing

        Args:
        -----
            es_result: the elasticsearch result list
            action_list: list of elasticsearch action items that were indexed
        """
        idx_time = XsdDatetime(datetime.datetime.utcnow())
        uri_keys = {}
        bnode_keys = {}
        for item in action_list:
            try:
                uri_keys[item['_id']] = item['_source']["uri"]
            except KeyError:
                bnode_keys[item['_id']] = item['_id']
        error_dict = {}
        error_bnodes = {}
        if es_result[1]:
            for result in es_result[1]:
                err_item = list(result.values())[0]
                try:
                    error_dict[uri_keys.pop(err_item['_id'])] = \
                            XsdString(err_item['error']['reason'])
                except KeyError:
                    error_bnodes[bnode_keys.pop(err_item['_id'])] = \
                            XsdString(err_item['error']['reason'])
        if uri_keys:
            sparql_good = """
                DELETE
                {{
                    ?s kds:esIndexTime ?esTime .
                    ?s kds:esIndexError ?esError .
                }}
                INSERT
                {{
                    GRAPH ?g {{ ?s kds:esIndexTime {idx_time} }}.
                }}
                WHERE
                {{
                    VALUES ?s {{ {subj_list} }} .
                    {{
                        SELECT DISTINCT ?g ?s ?esTime ?esError
                        {{
                            GRAPH ?g {{ ?s ?p ?o }} .
                            OPTIONAL {{
                                ?s kds:esIndexTime ?esTime
                            }}
                            OPTIONAL {{
                                ?s kds:esIndexError ?esError
                            }}
                        }}
                    }}
                }}
                """.format(idx_time=idx_time.sparql,
                           subj_list="<%s>" % ">\n<".join(uri_keys.values()))
            self.tstore_conn.update_query(sparql_good)
        # Process any errors that were found.
        if not error_dict:
            return
        # Delete all indexing triples related to the error subjects
        sparql_error = """
            DELETE
            {{
                ?s kds:esIndexTime ?esTime .
                ?s kds:esIndexError ?esError .
            }}
            WHERE
            {{
                VALUES ?s {{ {subj_list} }} .
                OPTIONAL {{
                    ?s kds:esIndexTime ?esTime
                }}
                OPTIONAL {{
                    ?s kds:esIndexError ?esError
                }}
            }}
            """.format(subj_list="<%s>" % ">\n<".join(error_dict.keys()))
        self.tstore_conn.update_query(sparql_error)
        del sparql_error
        sparql_update = """
            INSERT
            {{
                GRAPH ?g {{
                    ?s kds:esIndexTime {idx_time} .
                    ?s kds:esIndexError ?esError .
                }}
            }}
            WHERE
            {{
                VALUES (?s ?esError) {{ {error_list} }} .
                {{
                    SELECT DISTINCT ?g ?s
                    {{
                        graph ?g {{?s ?p ?o}}
                    }}
                }}
            }}""".format(
                    idx_time=idx_time.sparql,
                    error_list="\n".join(["(<%s> %s)" % (key, val.sparql)
                                          for key, val in error_dict.items()]))

        # Create a turtle data stream of the new errors to upload into the
        # triplestore
        self.tstore_conn.update_query(sparql_update)
        del sparql_update


    def delete_idx_status(self, rdf_class):
        """
        Removes all of the index status triples from the datastore

        Args:
        -----
            rdf_class: The class of items to remove the status from
        """

        sparql_template = """
            DELETE
            {{
                ?s kds:esIndexTime ?esTime .
                ?s kds:esIndexError ?esError .
            }}
            WHERE
            {{

                VALUES ?rdftypes {{\n\t\t{} }} .
                ?s a ?rdftypes .
                OPTIONAL {{
                    ?s kds:esIndexTime ?esTime
                }}
                OPTIONAL {{
                    ?s kds:esIndexError ?esError
                }}
                FILTER(bound(?esTime)||bound(?esError))
            }}
            """
        rdf_types = [rdf_class.uri] + [item.uri
                                       for item in rdf_class.subclasses]
        sparql = sparql_template.format("\n\t\t".join(rdf_types))
        log.warn("Deleting index status for %s", rdf_class.uri)
        return self.tstore_conn.update_query(sparql)

    def get_es_ids(self):
        """
        reads all the elasticsearch ids for an index
        """
        search = self.search.source(['uri']).sort(['uri'])
        es_ids = [item.meta.id for item in search.scan()]
        return es_ids

    def validate_index(self, rdf_class):
        """
        Will compare the triplestore and elasticsearch index to ensure that
        elasticsearch and triplestore items match. elasticsearch records
        that are not in the triplestore will be deleted
        """
        es_ids = set(self.get_es_ids())
        tstore_ids = set([item[1]
                          for item in self.get_uri_list(no_status=True)])
        diff = es_ids - tstore_ids
        if diff:
            pdb.set_trace()
            action_list = self.es_worker.make_action_list(diff,
                                                          action_type="delete")
            results = self.es_worker.bulk_save(action_list)