def post(self, request, pk: int, index: str, document_id: str):
    validate_index_and_project_perms(request, pk, index)

    serializer = self.get_serializer(data=request.data)
    if serializer.is_valid(raise_exception=True):
        ed = ElasticDocument(index)
        document = ed.get(document_id, fields=[TEXTA_TAGS_KEY])
        if not document:
            raise NotFound(f"Could not find document with ID '{document_id}' from index '{index}'!")

        document = document.get("_source")
        target_facts = serializer.validated_data.get("facts", [])
        existing_facts = document.get(TEXTA_TAGS_KEY, [])

        # Keep only the facts that do not match any of the target facts.
        new_facts = []
        for existing_fact in existing_facts:
            if not any(fact.items() <= existing_fact.items() for fact in target_facts):
                new_facts.append(existing_fact)

        document[TEXTA_TAGS_KEY] = new_facts
        ed.update(index, document_id, doc=document)
        return Response({"message": f"Removed given facts from document with the ID of {document_id}!"})
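A quick illustration of the subset comparison the filter above relies on: a target fact matches an existing fact when every key/value pair of the target is contained in it. The fact keys below are only an example of the usual texta_facts shape, not something defined by this endpoint.

# Illustration only: dict.items() returns a set-like view, so <= is a subset check.
target = {"fact": "PER", "str_val": "Alice"}
existing = {"fact": "PER", "str_val": "Alice", "doc_path": "text", "spans": "[[0, 5]]"}
assert target.items() <= existing.items()           # True: the existing fact would be removed.
assert not {"fact": "ORG"}.items() <= existing.items()  # No match: the fact is kept.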
def apply_mlp_on_es_docs(self, source_and_meta_docs: List[str], mlp_id: int):
    """
    Applies MLP on documents received by previous tasks and updates them in Elasticsearch.
    :param self: Reference to the Celery Task object of this task, courtesy of the bind parameter in the decorator.
    :param source_and_meta_docs: List of Elasticsearch document IDs to pull from Elasticsearch.
    :param mlp_id: ID of the MLPObject which contains progress.
    """
    mlp_object = get_mlp_object(mlp_id)
    task_object = mlp_object.task

    # Get the necessary fields.
    field_data: List[str] = json.loads(mlp_object.fields)
    if TEXTA_TAGS_KEY not in field_data:
        # Add in existing facts so that proper duplicate filtering would be applied.
        field_data.append(TEXTA_TAGS_KEY)

    analyzers: List[str] = json.loads(mlp_object.analyzers)

    # Retrieve the documents from Elasticsearch.
    document_wrapper = ElasticDocument(index=None)
    source_and_meta_docs = document_wrapper.get_bulk(doc_ids=source_and_meta_docs, fields=field_data)
    source_documents = [doc["_source"] for doc in source_and_meta_docs]

    mlp_docs = apply_mlp_on_documents(source_documents, analyzers, field_data, mlp_id)
    es_documents = unite_source_with_meta(source_and_meta_docs, mlp_docs)
    update_documents_in_es(es_documents)

    # Update progress.
    task_object.update_progress_iter(len(source_and_meta_docs))
    return True
def post(self, request, pk: int):
    ed = ElasticDocument(index=None)

    # Validate the payload and project permissions.
    serializer: InsertDocumentsSerializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    project = get_object_or_404(Project, pk=pk)
    if not project.users.filter(pk=request.user.pk).exists():
        raise PermissionDenied("You do not have permissions for this project!")

    # Split documents based on whether they have index access or lack any index details at all.
    documents = serializer.validated_data["documents"]
    split_fields = serializer.validated_data["split_text_in_fields"]
    indices = project.get_indices()

    correct_actions, failed_actions, missing_actions = self._split_documents_per_index(allowed_indices=indices, documents=documents)
    missing_actions, index_name, has_new_index = self._normalize_missing_index_values(missing_actions, project.pk, indices)
    split_actions = self._split_text(correct_actions + missing_actions, split_fields)

    if has_new_index:
        ed.core.create_index(index_name)
        ed.core.add_texta_facts_mapping(index_name)
        index, is_created = Index.objects.get_or_create(name=index_name, is_open=True)
        project.indices.add(index)

    # Send the documents to Elasticsearch.
    success_count, errors = ed.bulk_add_generator(actions=split_actions, stats_only=False)

    return Response(
        {
            "successfully_indexed": success_count,
            "errors": errors,
            "failed_index_permissions": len(failed_actions)
        }
    )
def get(self, request, pk: int, index: str, document_id: str):
    validate_index_and_project_perms(request, pk, index)

    ed = ElasticDocument(index)
    document = ed.get(document_id)
    if not document:
        raise NotFound(f"Could not find document with ID '{document_id}' from index '{index}'!")
    return Response(document)
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m"
        )

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()
        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
    """
    This is a helper function to parse all the given fields and use the document_ids as input
    to make a significant_words aggregation.
    Args:
        exclude: Regex compatible string for which words to exclude, uses the exclude parameter of Elasticsearch aggregations.
        stop_words: Optional parameter to remove stopwords from the results.
        indices: Indices from which to perform the aggregation.
        fields: From which fields can you get the text content needed for comparison.
        document_ids: IDs of the documents you want to use as a baseline for the aggregation.

    Returns: List of dictionaries with the significant word and how many times it occurs in the documents.
    """
    ed = ElasticDocument("*")
    ea = ElasticAggregator(indices=indices)

    stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)

    # Validate that those documents exist.
    validated_docs: List[dict] = ed.get_bulk(document_ids)
    if validated_docs:
        unique_ids = list(set([doc["_id"] for doc in validated_docs]))
        significant_words = []
        for field in fields:
            sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
            significant_words += sw
        return significant_words
    else:
        return []
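A minimal usage sketch of the helper above. The index name, fields, document IDs, and stop words are placeholders; only the signature and return description come from the function itself.

words = get_significant_words(
    indices=["my_index"],            # placeholder index name
    fields=["text", "title"],        # placeholder field names
    document_ids=["doc-1", "doc-2"],
    stop_words=["the", "and"],
    exclude="[0-9]+",                # drop purely numeric tokens via the aggregation's exclude regex
)
# Per the docstring, the result pairs each significant word with its occurrence count.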
def update_documents_in_es(documents: List[dict]):
    """
    Updates the documents inside Elasticsearch, either with the MLP results or the error messages.

    :param documents: Full Elasticsearch documents.
    """
    ed = ElasticDocument(index=None)
    ed.bulk_update(actions=documents)
def delete(self, request, pk: int, index: str, document_id: str):
    validate_index_and_project_perms(request, pk, index)

    try:
        ed = ElasticDocument(index)
        document = ed.delete(doc_id=document_id)
        return Response(document)
    except texta_elastic.exceptions.NotFoundError:
        return Response(status=status.HTTP_404_NOT_FOUND)
def tag_cluster(self, cluster_pk: int, clustering_object_pk: int, fact: dict):
    ed = ElasticDocument("")
    cluster = Cluster.objects.get(pk=cluster_pk)
    clustering_object = ClusteringResult.objects.get(pk=clustering_object_pk)

    doc_ids = json.loads(cluster.document_ids)
    ignored_ids = json.loads(clustering_object.ignored_ids)

    ed.add_fact_to_documents(fact=fact, doc_ids=doc_ids)

    clustering_object.ignored_ids = json.dumps(doc_ids + ignored_ids)
    clustering_object.save()
    return True
def patch(self, request, pk: int, index: str, document_id: str):
    validate_index_and_project_perms(request, pk, index)

    try:
        ed = ElasticDocument(index)
        document = ed.update(index=index, doc_id=document_id, doc=request.data)
        return Response(document)
    except elasticsearch.exceptions.RequestError as e:
        # TODO Extend the decorator with different variants of the request error instead.
        if e.error == "mapper_parsing_exception":
            return Response(e.info["error"]["root_cause"], status=status.HTTP_400_BAD_REQUEST)
    except texta_elastic.exceptions.NotFoundError:
        return Response(status=status.HTTP_404_NOT_FOUND)
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)

    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields and make sure every index has the texta_facts mapping.
        field_data = fields
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
def bulk_add_documents(
        elastic_search: ElasticSearcher,
        elastic_doc: ElasticDocument,
        index: str,
        chunk_size: int,
        field_data: List[dict],
        flatten_doc=False,
):
    new_docs = apply_custom_processing(elastic_search, flatten_doc)
    actions = apply_field_changes_generator(new_docs, index, field_data)
    # refresh="wait_for" makes the request block until the next index refresh, so the added documents are searchable right away.
    elastic_doc.bulk_add_generator(actions=actions, chunk_size=chunk_size, refresh="wait_for")
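A hedged sketch of wiring this helper together. The ElasticSearcher/ElasticDocument arguments mirror other call sites in these examples, but the field_data entries are purely hypothetical, since apply_field_changes_generator and the expected schema are not shown here.

searcher = ElasticSearcher(indices=["source_index"], output=ElasticSearcher.OUT_RAW)
elastic_doc = ElasticDocument(index="target_index")
bulk_add_documents(
    elastic_search=searcher,
    elastic_doc=elastic_doc,
    index="target_index",
    chunk_size=500,
    field_data=[{"path": "text", "new_path": "text_copy"}],  # hypothetical field-change spec
    flatten_doc=False,
)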
def pull_document_by_id(self, request, pk=None, project_pk=None):
    annotator: Annotator = self.get_object()
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    ed = ElasticDocument(index=annotator.get_indices())
    document_id = serializer.validated_data["document_id"]
    document = ed.get(document_id)
    if document:
        document = self._process_document_output(document, annotator)
        return Response(document)
    else:
        return Response({"message": "No such document!"}, status=status.HTTP_404_NOT_FOUND)
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task

    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task

    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout
        )

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=snowball_language,
            fields_to_parse=fields,
            analyzers=analyzers,
            tokenizer=tokenizer,
            strip_html=strip_html
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()
        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def import_dataset(self) -> list:
    error_container = []

    # Retrieve content from the file.
    success, file_content = self._get_file_content()
    file_content = file_content.dropna(how="all")

    # Check if the file was parsed.
    if not success:
        error_container.append('unknown file type')
        return error_container

    # Convert the content to a list of records (dicts).
    records = file_content.to_dict(orient='records')

    # Set num_records.
    self.num_records = len(records)

    # Set the total number of documents for progress.
    if self.show_progress:
        self.show_progress.set_total(self.num_records)

    # Add documents to Elasticsearch.
    es_doc = ElasticDocument(self.index)

    # Create the index.
    es_doc.core.create_index(self.index)

    # Add the mapping for texta_facts.
    es_doc.core.add_texta_facts_mapping(self.index)

    # Drop NaN values from each record and index the records in chunks.
    chunk_size = 500
    records = [{k: v for k, v in record.items() if pd.Series(v).notna().all()} for record in records]
    record_chunks = list(chunks(records, chunk_size))

    for documents in record_chunks:
        success, errors = es_doc.bulk_add(documents, chunk_size=chunk_size, stats_only=False, raise_on_error=False)
        self.num_records_success += success
        if self.show_progress:
            self.show_progress.update(success)
        for error in list(errors):
            message = error["index"]["error"]["reason"] if isinstance(error, dict) else str(error)
            error_container.append(message)

    return error_container
def multitag_docs(self, request, pk=None, project_pk=None):
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    # Filter tagger groups present in the project.
    project_object = Project.objects.get(id=project_pk)
    regex_taggers_groups = RegexTaggerGroup.objects.filter(project=project_object)

    # Filter again based on the serializer input.
    if serializer.validated_data['tagger_groups']:
        regex_taggers_groups = regex_taggers_groups.filter(pk__in=serializer.validated_data['tagger_groups'])

    docs = serializer.validated_data["docs"]
    fields = serializer.validated_data["fields"]

    # Apply the taggers.
    result = []
    for regex_tagger_group in regex_taggers_groups:
        matches = regex_tagger_group.tag_docs(fields, docs)
        result.extend(matches)

    result = ElasticDocument.remove_duplicate_facts(result)

    return Response(result, status=status.HTTP_200_OK)
def _initialize_es(self, project_pk, text_processor, callback_progress, prediction_to_match):
    # Create the Elasticsearch document handler.
    es_doc = ElasticDocument(self.feedback_index)

    # If there is no model object, return None for both the search and the query.
    if not self.model_object:
        return es_doc, None, None

    # Create the matching query.
    query = Query()
    query.add_string_filter(query_string=self.model_object.MODEL_TYPE, fields=["model_type"])
    if self.model_object:
        query.add_string_filter(query_string=str(self.model_object.pk), fields=["model_id"])
    if prediction_to_match:
        query.add_string_filter(query_string=prediction_to_match, fields=["correct_result"])

    # If the index does not exist, don't create the searcher object.
    if not self.check_index_exists():
        return es_doc, None, query.query

    # Create the Elasticsearch searcher.
    es_search = ElasticSearcher(
        indices=self.feedback_index,
        query=query.query,
        text_processor=text_processor,
        output=ElasticSearcher.OUT_DOC_WITH_ID,
        callback_progress=callback_progress
    )

    # Return the objects.
    return es_doc, es_search, query.query
def _get_sample_document(self, id_field: str, id_value: str, index: str):
    query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
    es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
    ed = ElasticDocument(index=index)

    response = es.search()["hits"]["hits"]
    document = response[0] if response else None
    return ed, document
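For reference, the Search/Q combination above builds a plain term query against the keyword sub-field. A small sketch with example values, assuming Search and Q come from elasticsearch_dsl as their usage here suggests:

from elasticsearch_dsl import Q, Search

query = Search().query(Q("term", **{"uuid.keyword": "abc123"})).to_dict()
# query == {"query": {"term": {"uuid.keyword": "abc123"}}}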
def setUp(self): self.user = create_test_user('user', '*****@*****.**', 'pw') self.new_test_index_name = f"ttk_test_query_tagger_{uuid.uuid4().hex[:5]}" self.project = project_creation("SearchQueryTaggerTestProject", self.new_test_index_name, self.user) self.project.users.add(self.user) self.url = reverse("v2:search_query_tagger-list", kwargs={"project_pk": self.project.pk}) self.uuid = "adasda-5874856a-das4das98f5" self.document = { "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.", "Field_2": "This is a different sentence.", "Field_3": "This is test data.", "uuid": self.uuid} self.ed = ElasticDocument(index=self.new_test_index_name) self.ed.add(self.document) self.client.login(username='******', password='******')
def count_indices(self, request, pk=None, project_pk=None):
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid()

    indices = [{"name": name} for name in serializer.validated_data.get("indices", [])]
    serializer = IndexSerializer(data=indices, many=True)
    serializer.is_valid(raise_exception=True)

    project: Project = self.get_object()
    ed = ElasticDocument(None)

    indices = [index["name"] for index in indices]
    if indices:
        # We check for indices beforehand to prevent the default behaviour of picking all the indices in the project.
        indices = project.get_available_or_all_project_indices(indices)
        count = ed.count(indices=indices)
        return Response(count)
    else:
        return Response(0)
def post(self, request, project_pk: int):
    project: Project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    serializer = ProjectDocumentSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
    if not indices:
        raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

    doc_id = serializer.validated_data["doc_id"]
    if not doc_id:
        raise InvalidInputDocument(detail="No doc_id supplied")

    es = ElasticDocument(index=indices)
    results = es.get(doc_id)
    return Response(results, status=status.HTTP_200_OK)
def process_mlp_actions(generator: ElasticSearcher, analyzers: List[str], field_data: List[str], mlp_class: MLP, mlp_id: int):
    """
    ElasticSearcher returns a list of 100 RAW Elasticsearch documents. Since MLP needs a raw document to process,
    we need to memorize the index of the document in question so that we can later fetch its metadata for the bulk generator.
    """
    counter = 0
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(f"Starting the processing of indices for MLP worker with ID of {mlp_id}!")

    for document_batch in generator:
        document_sources = [dict(hit["_source"]) for hit in document_batch]
        mlp_processed = mlp_class.process_docs(document_sources, analyzers=analyzers, doc_paths=field_data)

        for index, mlp_processed_document in enumerate(mlp_processed):
            original_elastic_document = document_batch[index]

            # Make sure that existing facts in the document and new ones don't overlap.
            original_facts = original_elastic_document["_source"].get("texta_facts", [])
            new_facts = mlp_processed_document.get("texta_facts", [])
            total_facts = [fact for fact in original_facts + new_facts if fact]
            unique_facts = ElasticDocument.remove_duplicate_facts(total_facts)

            elastic_update_body = {
                "_id": original_elastic_document["_id"],
                "_index": original_elastic_document["_index"],
                "_type": original_elastic_document.get("_type", "_doc"),
                "_op_type": "update",
                "doc": {**mlp_processed_document, **{"texta_facts": unique_facts}}
            }

            yield elastic_update_body

            counter += 1
            progress = generator.callback_progress
            if counter % generator.scroll_size == 0:
                info_logger.info(f"Progress on applying MLP for worker with id: {mlp_id} at {counter} out of {progress.n_total} documents!")
            elif counter == progress.n_total:
                info_logger.info(f"Finished applying MLP for worker with id: {mlp_id} at {counter}/{progress.n_total} documents!")
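The duplicate filtering above delegates to ElasticDocument.remove_duplicate_facts. A small sketch of the expected behaviour; the fact keys are an assumed example of the texta_facts shape rather than something defined in this generator.

facts = [
    {"fact": "LOC", "str_val": "Tallinn", "doc_path": "text"},
    {"fact": "LOC", "str_val": "Tallinn", "doc_path": "text"},  # exact repeat
    {"fact": "LOC", "str_val": "Tartu", "doc_path": "text"},
]
unique_facts = ElasticDocument.remove_duplicate_facts(facts)
# Based on how it is used here, this should keep one Tallinn fact plus the Tartu fact.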
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task

    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="15m"
        )

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher, field=field, worker_id=apply_worker_id, mlp_class=mlp)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()
        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task

    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(summarizer_object.query),
            indices=indices,
            field_data=field_data,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="30m"
        )

        actions = process_actions(searcher, field_data, ratio_data, algorithm=algorithm_data, summarizer_class=sumy, summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def update_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, tagger_object: BertTaggerObject, tagger: BertTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying BERT Tagger with ID {tagger_object.id} to batch {i + 1}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    result = tagger_object.apply_loaded_tagger(tagger, text, input_type="text", feedback=False)

                    # If the tagger is binary and the fact value is not specified by the user, use the tagger description as the fact value.
                    if result["result"] in ["true", "false"]:
                        if not fact_value:
                            fact_value = tagger_object.description
                    # For multitag, use the prediction as the fact value.
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(result, field, fact_name, fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
def retrieve(self, request, *args, **kwargs):
    # API v1 to v2 compliance.
    if "clustering_pk" in self.kwargs:
        topic_analyzer_pk = self.kwargs["clustering_pk"]
    elif "topic_analyzer_pk" in self.kwargs:
        topic_analyzer_pk = self.kwargs["topic_analyzer_pk"]

    queryset = Cluster.objects.filter(clusteringresult__project__pk=self.kwargs["project_pk"], clusteringresult__pk=topic_analyzer_pk)
    cluster = get_object_or_404(queryset, pk=self.kwargs["pk"])

    doc_ids = json.loads(cluster.document_ids)
    fields = json.loads(cluster.fields)
    indices = json.loads(cluster.indices)
    significant_words = json.loads(cluster.significant_words)
    display_fields = json.loads(cluster.display_fields)
    if display_fields:
        fields += display_fields

    ed = ElasticDocument(index=",".join(indices))

    documents = ed.get_bulk(doc_ids, flatten=True)
    documents = documents if documents else []
    documents = [{"id": doc["_id"], "index": doc["_index"], "content": doc["_source"]} for doc in documents]

    formatted_cluster = {
        "id": cluster.pk,
        "intracluster_similarity": cluster.intracluster_similarity,
        "document_count": cluster.get_document_count(),
        "significant_words": significant_words,
        "documents": documents
    }
    return Response(formatted_cluster)
def update(self, request, *args, **kwargs):
    serializer = ClusterSerializer(data=request.data, partial=True)
    serializer.is_valid()

    cluster = Cluster.objects.get(pk=kwargs["pk"])
    clustering_object = ClusteringResult.objects.get(pk=kwargs["topic_analyzer_pk"])

    fields = json.loads(cluster.fields)
    stop_words = json.loads(clustering_object.stop_words)
    indices = json.loads(cluster.indices)

    if "document_ids" in serializer.validated_data:
        document_ids = serializer.validated_data["document_ids"]
        ed = ElasticDocument("*")

        # Validate that those documents exist.
        validated_docs = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list(set([doc["_id"] for doc in validated_docs]))
            cluster.document_ids = json.dumps(unique_ids)

            sw = Cluster.get_significant_words(indices=indices, fields=fields, document_ids=unique_ids, stop_words=stop_words)
            cluster.significant_words = json.dumps(sw)

            cluster_content = ClusterContent(unique_ids, vectors_filepath=clustering_object.vector_model.name)
            cluster.intracluster_similarity = cluster_content.get_intracluster_similarity()
        else:
            cluster.document_ids = json.dumps([])

    cluster.save()
    return Response({"message": "Cluster has been updated successfully!"})
def skip_document(self, request, pk=None, project_pk=None):
    serializer: DocumentIDSerializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    annotator: Annotator = self.get_object()
    ed = ElasticDocument(index=annotator.get_indices())
    document_id = serializer.validated_data["document_id"]
    document = ed.get(document_id)

    texta_annotations = document["_source"].get("texta_annotator", [])
    processed_timestamp = None

    if texta_annotations:
        # If any annotation already carries a processed timestamp, the document is considered annotated.
        for texta_annotation in texta_annotations:
            processed_timestamp = texta_annotation.get("processed_timestamp_utc", None)
            if processed_timestamp:
                return Response({"detail": f"Document with ID: {serializer.validated_data['document_id']} is already annotated"})

        annotator.skip_document(serializer.validated_data["document_id"], serializer.validated_data["index"], user=request.user)
        return Response({"detail": f"Skipped document with ID: {serializer.validated_data['document_id']}"})
    else:
        annotator.skip_document(serializer.validated_data["document_id"], serializer.validated_data["index"], user=request.user)
        return Response({"detail": f"Skipped document with ID: {serializer.validated_data['document_id']}"})
def setUp(self):
    self.user = create_test_user('user', '*****@*****.**', 'pw')
    self.index_uuid = uuid.uuid4().hex[:5]
    self.new_test_index_name = f"ttk_test_fields_tagger_{self.index_uuid}"

    self.ed = ElasticDocument(index=self.new_test_index_name)
    self.ed.core.es.indices.create(index=self.new_test_index_name, ignore=[400, 404])

    self.project = project_creation("SearchFieldsTaggerTestProject", self.new_test_index_name, self.user)
    self.project.users.add(self.user)
    self.url = reverse(f"{VERSION_NAMESPACE}:search_fields_tagger-list", kwargs={"project_pk": self.project.pk})

    self.uuid = uuid.uuid4().hex[:10]
    self.document = {
        "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
        "Field_2": "This is a different sentence.",
        "Field_3": "This is test data.",
        "newline_break": "olgu\nõnnistatud\npüha\nkäsikranaat",
        "array_break": ["olgu", "õnnistatud", "püha", "käsikranaat"],
        "uuid": self.uuid
    }
    self.ed.add(self.document)
    self.client.login(username='******', password='******')