Example #1
class Autocomplete:

    def __init__(self, project, indices, limit=10):
        self.project = project
        self.limit = limit
        self.es = ElasticSearcher(output=ElasticSearcher.OUT_RAW, indices=indices)

    def get_fact_names(self, startswith):
        """Returns up to `limit` fact names that start with the given prefix, using a nested terms aggregation on texta_facts."""
        query = {"aggs": {"fact": {
            "nested": {"path": "texta_facts"},
            "aggs": {"fact": {"terms": {"field": "texta_facts.fact", "size": self.limit, "include": f"{startswith}.*"}}}
        }}}

        self.es.update_query(query)
        results = self.es.search()

        facts = [a['key'] for a in results['aggregations']['fact']['fact']['buckets']]

        return facts


    def get_fact_values(self, startswith, fact_name):
        """Returns up to `limit` values of the given fact that start with the given prefix."""
        query = {"aggs": {"str_val": {
            "nested": {"path": "texta_facts"},
            "aggs": {"str_val": {
                "terms": {"field": "texta_facts.fact"},
                "aggs": {"fact_values": {"terms": {"field": "texta_facts.str_val", "size": self.limit, "include": f"{startswith}.*"}}}
            }}
        }}}

        self.es.update_query(query)
        results = self.es.search()

        facts = []
        for bucket in results['aggregations']['str_val']['str_val']['buckets']:
            if bucket['key'] == fact_name:
                facts += [sub_bucket['key'] for sub_bucket in bucket['fact_values']['buckets']]

        return facts

    def get_lexicons(self, startswith):
        # TODO
        pass
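A minimal usage sketch for the class above, assuming Autocomplete and the toolkit's ElasticSearcher are importable in the surrounding project; the index name, prefixes and fact name below are hypothetical.

# Hypothetical usage: the project argument is stored but unused by the methods shown above.
autocomplete = Autocomplete(project=None, indices=["my_texta_index"], limit=5)
fact_names = autocomplete.get_fact_names("PER")             # e.g. ["PERSON", "PERCENT"]
fact_values = autocomplete.get_fact_values("Jo", "PERSON")  # PERSON values starting with "Jo"
print(fact_names, fact_values)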
Example #2
    def _get_sample_document(self, id_field: str, id_value: str, index: str):
        """Fetches a single document whose id_field matches id_value, along with an ElasticDocument handle for the index."""
        query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
        es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
        ed = ElasticDocument(index=index)
        response = es.search()["hits"]["hits"]
        document = response[0] if response else None
        return ed, document
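A hedged sketch of a call site for the helper above, written as if from another method of the same class; the field name, value and index are made up for illustration.

# Hypothetical call: fetch a sample document by its "uuid" field.
ed, document = self._get_sample_document(id_field="uuid", id_value="123-abc", index="my_texta_index")
if document is None:
    # how missing documents are handled is up to the caller
    raise ValueError("No document found for the given ID.")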
Example #3
def get_tag_candidates(tagger_group_id: int,
                       text: str,
                       ignore_tags: List[dict] = [],
                       n_similar_docs: int = 10,
                       max_candidates: int = 10):
    """
    Finds frequent tags from documents similar to the input document.
    Returns an empty list if the hybrid option is false.
    """
    hybrid_tagger_object = TaggerGroup.objects.get(pk=tagger_group_id)
    field_paths = json.loads(hybrid_tagger_object.taggers.first().fields)
    indices = hybrid_tagger_object.get_indices()
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Selecting from following indices: {indices}.")
    ignore_tags = {tag["tag"] for tag in ignore_tags}
    # create query
    query = Query()
    query.add_mlt(field_paths, text)
    # create Searcher object for MLT
    es_s = ElasticSearcher(indices=indices, query=query.query)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Trying to retrieve {n_similar_docs} documents from Elastic..."
    )
    docs = es_s.search(size=n_similar_docs)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Successfully retrieved {len(docs)} documents from Elastic."
    )
    # dict for tag candidates from elastic
    tag_candidates = {}
    # retrieve tags from elastic response
    for doc in docs:
        if "texta_facts" in doc:
            for fact in doc["texta_facts"]:
                if fact["fact"] == hybrid_tagger_object.fact_name:
                    fact_val = fact["str_val"]
                    if fact_val not in ignore_tags:
                        tag_candidates[fact_val] = tag_candidates.get(fact_val, 0) + 1
    # sort and limit candidates
    tag_candidates = [
        item[0] for item in sorted(
            tag_candidates.items(), key=lambda k: k[1], reverse=True)
    ][:max_candidates]
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Retrieved {len(tag_candidates)} tag candidates."
    )
    return tag_candidates
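A minimal sketch of calling the function above; the tagger group ID, text and ignored tags are hypothetical, and the ignore_tags entries follow the {"tag": ...} shape the function reads.

# Hypothetical call: fetch up to 5 tag candidates, skipping tags that are already applied.
candidates = get_tag_candidates(
    tagger_group_id=12,
    text="Sample document text used to find similar documents.",
    ignore_tags=[{"tag": "politics"}],
    n_similar_docs=20,
    max_candidates=5
)
print(candidates)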
Example #4
    def post(self, request, project_pk: int):
        """Simplified search interface for making Elasticsearch queries."""
        serializer = ProjectSimplifiedSearchSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        project_object = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project_object)
        project_indices = list(project_object.get_indices())
        project_fields = project_object.get_elastic_fields(path_list=True)
        # test if indices exist
        if not project_indices:
            raise ProjectValidationFailed(detail="Project has no indices")
        # test if indices are valid
        if serializer.validated_data['match_indices']:
            if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)):
                raise ProjectValidationFailed(detail=f"Index names are not valid for this project. allowed values are: {project_indices}")
        # test if fields are valid
        if serializer.validated_data['match_fields']:
            if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)):
                raise ProjectValidationFailed(detail=f"Fields names are not valid for this project. allowed values are: {project_fields}")

        es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
        q = Query(operator=serializer.validated_data['operator'])
        # if input is string, convert to list
        # if unknown format, return error
        match_text = serializer.validated_data['match_text']
        if isinstance(match_text, list):
            match_texts = [str(item) for item in match_text if item]
        elif isinstance(match_text, str):
            match_texts = [match_text]
        else:
            return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)
        # add query filters
        for item in match_texts:
            q.add_string_filter(item, match_type=serializer.validated_data["match_type"])
        # update query
        es.update_query(q.query)
        # retrieve results
        results = es.search(size=serializer.validated_data["size"])
        return Response(results, status=status.HTTP_200_OK)
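Based on the serializer fields the view reads above, a client request might look like the sketch below; the payload keys mirror those fields, while the URL, token and example values are assumptions.

# Hypothetical client call against the simplified search endpoint.
import requests

payload = {
    "match_text": ["elasticsearch", "query"],  # a string or a list of strings
    "match_type": "phrase",                    # passed to Query.add_string_filter
    "match_indices": ["my_texta_index"],       # must be a subset of the project's indices
    "match_fields": ["text"],                  # must be a subset of the project's fields
    "operator": "must",
    "size": 10,
}
response = requests.post(
    "http://localhost:8000/api/v1/projects/1/search/",  # hypothetical URL
    json=payload,
    headers={"Authorization": "Token <token>"},
)
print(response.json())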
Example #5
    def post(self, request, project_pk: int):
        """Executes **raw** Elasticsearch query on all project indices."""
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        serializer = ProjectSearchByQuerySerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])

        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        # fall back to the default output type if none was supplied
        output_type = serializer.validated_data["output_type"] or ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS
        es = ElasticSearcher(indices=indices, output=output_type)

        es.update_query(serializer.validated_data["query"])
        results = es.search()
        return Response(results, status=status.HTTP_200_OK)
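A sketch of a request body for the raw-query endpoint above; the keys mirror the serializer fields read in the view, and the index name, query and URL are assumptions.

# Hypothetical client call with a raw Elasticsearch DSL query.
import requests

payload = {
    "indices": ["my_texta_index"],                       # optional subset of the project's indices
    "query": {"query": {"match": {"text": "example"}}},  # raw Elasticsearch query body
    # "output_type" may also be supplied to override the OUT_DOC_WITH_TOTAL_HL_AGGS default
}
response = requests.post(
    "http://localhost:8000/api/v1/projects/1/elastic/search_by_query/",  # hypothetical URL
    json=payload,
    headers={"Authorization": "Token <token>"},
)
print(response.json())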