class Autocomplete:
    """Prefix-based suggestions for fact names and fact values, backed by
    Elasticsearch nested aggregations over the ``texta_facts`` field."""

    def __init__(self, project, indices, limit=10):
        self.project = project
        self.limit = limit  # max number of suggestion buckets returned
        self.es = ElasticSearcher(output=ElasticSearcher.OUT_RAW, indices=indices)

    def get_fact_names(self, startswith):
        """Return up to ``limit`` distinct fact names starting with ``startswith``."""
        agg_query = {
            "aggs": {
                "fact": {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        "fact": {
                            "terms": {
                                "field": "texta_facts.fact",
                                "size": self.limit,
                                # terms "include" is a regex, so anchor on the prefix
                                "include": f"{startswith}.*",
                            }
                        }
                    },
                }
            }
        }
        self.es.update_query(agg_query)
        raw_result = self.es.search()
        buckets = raw_result["aggregations"]["fact"]["fact"]["buckets"]
        return [bucket["key"] for bucket in buckets]

    def get_fact_values(self, startswith, fact_name):
        """Return up to ``limit`` values of fact ``fact_name`` starting with ``startswith``."""
        agg_query = {
            "aggs": {
                "str_val": {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        "str_val": {
                            "terms": {"field": "texta_facts.fact"},
                            "aggs": {
                                "fact_values": {
                                    "terms": {
                                        "field": "texta_facts.str_val",
                                        "size": self.limit,
                                        "include": f"{startswith}.*",
                                    }
                                }
                            },
                        }
                    },
                }
            }
        }
        self.es.update_query(agg_query)
        raw_result = self.es.search()
        # aggregation groups values under every fact name; keep only the requested one
        matches = []
        for name_bucket in raw_result["aggregations"]["str_val"]["str_val"]["buckets"]:
            if name_bucket["key"] == fact_name:
                matches.extend(value_bucket["key"] for value_bucket in name_bucket["fact_values"]["buckets"])
        return matches

    def get_lexicons(self, startswith):
        # TODO
        pass
def _get_sample_document(self, id_field: str, id_value: str, index: str):
    """Fetch the first document whose ``{id_field}.keyword`` equals ``id_value``.

    Returns a tuple of (ElasticDocument bound to ``index``, raw hit dict or
    None when nothing matched).
    """
    term_query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
    searcher = ElasticSearcher(query=term_query, output=ElasticSearcher.OUT_RAW)
    elastic_document = ElasticDocument(index=index)
    hits = searcher.search()["hits"]["hits"]
    # only the first match is of interest; absence is signalled with None
    first_hit = hits[0] if hits else None
    return elastic_document, first_hit
def get_tag_candidates(tagger_group_id: int, text: str, ignore_tags: List[str] = None, n_similar_docs: int = 10, max_candidates: int = 10):
    """
    Finds frequent tags from documents similar to input document.
    Returns empty list if hybrid option false.

    :param tagger_group_id: pk of the TaggerGroup whose fact_name is counted.
    :param text: input text used to build the more-like-this query.
    :param ignore_tags: optional list of {"tag": ...} dicts to exclude from
        the candidates. Default is None (was a mutable ``[]``, which is shared
        across calls in Python — fixed to the None sentinel).
    :param n_similar_docs: how many similar documents to fetch from Elastic.
    :param max_candidates: cap on the number of returned tag candidates.
    :return: list of tag values sorted by descending frequency.
    """
    hybrid_tagger_object = TaggerGroup.objects.get(pk=tagger_group_id)
    field_paths = json.loads(hybrid_tagger_object.taggers.first().fields)
    indices = hybrid_tagger_object.get_indices()
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Selecting from following indices: {indices}.")
    # a set is enough for membership tests (the original kept a dict of Trues)
    ignored = {tag["tag"] for tag in (ignore_tags or [])}
    # create query
    query = Query()
    query.add_mlt(field_paths, text)
    # create Searcher object for MLT
    es_s = ElasticSearcher(indices=indices, query=query.query)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Trying to retrieve {n_similar_docs} documents from Elastic..."
    )
    docs = es_s.search(size=n_similar_docs)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Successfully retrieved {len(docs)} documents from Elastic."
    )
    # count occurrences of the tagger group's fact values across similar docs
    tag_counts = {}
    for doc in docs:
        for fact in doc.get("texta_facts", []):
            if fact["fact"] == hybrid_tagger_object.fact_name:
                fact_val = fact["str_val"]
                if fact_val not in ignored:
                    tag_counts[fact_val] = tag_counts.get(fact_val, 0) + 1
    # sort by frequency (descending) and limit candidates
    tag_candidates = [
        tag for tag, _ in sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    ][:max_candidates]
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Retrieved {len(tag_candidates)} tag candidates."
    )
    return tag_candidates
def post(self, request, project_pk: int):
    """Simplified search interface for making Elasticsearch queries."""
    serializer = ProjectSimplifiedSearchSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    project_object = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project_object)
    project_indices = list(project_object.get_indices())
    project_fields = project_object.get_elastic_fields(path_list=True)
    # a project without indices cannot be searched at all
    if not project_indices:
        raise ProjectValidationFailed(detail="Project has no indices")
    validated = serializer.validated_data
    # requested indices, if any, must all belong to this project
    requested_indices = validated['match_indices']
    if requested_indices and not set(requested_indices).issubset(set(project_indices)):
        raise ProjectValidationFailed(detail=f"Index names are not valid for this project. allowed values are: {project_indices}")
    # requested fields, if any, must all belong to this project
    requested_fields = validated['match_fields']
    if requested_fields and not set(requested_fields).issubset(set(project_fields)):
        raise ProjectValidationFailed(detail=f"Fields names are not valid for this project. allowed values are: {project_fields}")
    es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
    q = Query(operator=validated['operator'])
    # normalise match_text into a list of strings; reject anything else
    match_text = validated['match_text']
    if isinstance(match_text, list):
        match_texts = [str(item) for item in match_text if item]
    elif isinstance(match_text, str):
        match_texts = [match_text]
    else:
        return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)
    # add one string filter per text snippet
    for snippet in match_texts:
        q.add_string_filter(snippet, match_type=validated["match_type"])
    es.update_query(q.query)
    results = es.search(size=validated["size"])
    return Response(results, status=status.HTTP_200_OK)
def post(self, request, project_pk: int):
    """Executes **raw** Elasticsearch query on all project indices."""
    project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)
    serializer = ProjectSearchByQuerySerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
    if not indices:
        raise ProjectValidationFailed(detail="No indices supplied and project has no indices")
    # fall back to the default output format when none was requested
    output_type = serializer.validated_data["output_type"]
    if not output_type:
        output_type = ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS
    es = ElasticSearcher(indices=indices, output=output_type)
    es.update_query(serializer.validated_data["query"])
    return Response(es.search(), status=status.HTTP_200_OK)