def start_fact_delete_query_task(self, worker_id: int):
    """
    Scrolls the documents matching the query and sets the total for the fact-deletion task.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        logging.getLogger(INFO_LOGGER).info(f"Celery: Starting task for deleting facts by query for worker with ID: {worker_object.pk}")

        # init progress
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=worker_object.get_indices(),
            output=ElasticSearcher.OUT_DOC,
            callback_progress=show_progress,
            scroll_size=worker_object.scroll_size,
            field_data=["texta_facts"]
        )

        count = searcher.count()
        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document ID-s and passes them to the MLP worker.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(f"Applying MLP on the index for MLP Task ID: {mlp_id}")

        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(
            query=json.loads(mlp_object.query),
            indices=indices,
            output=ElasticSearcher.OUT_META,
            callback_progress=show_progress,
            scroll_size=es_scroll_size,
            scroll_timeout=f"{es_timeout}m"
        )

        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        show_progress.update_step(f'Applying MLP to {len(doc_chunks)} batches of documents')
        show_progress.update_view(0)
        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass document id-s to the next task
        chain = group(apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id) for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
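# `chunks_iter` is not defined in this module; below is a minimal sketch of the
# batching helper the call above presumably refers to (name and signature are
# assumed from the call site). It lazily slices any iterable into lists of at
# most `chunk_size` items, so the scroll generator is consumed only once and
# each batch can become its own `apply_mlp_on_es_docs` subtask.
from itertools import islice

def chunks_iter(iterable, chunk_size: int):
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk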
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout
        )

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=snowball_language,
            fields_to_parse=fields,
            analyzers=analyzers,
            tokenizer=tokenizer,
            strip_html=strip_html
        )

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        task_object.complete()
        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
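# For reference, a hypothetical sketch of the shape of the actions that
# `process_analyzer_actions` is expected to yield into `bulk_update` above.
# The field names follow the standard Elasticsearch bulk-helper update format;
# the project's actual payload may differ.
def example_analyzer_action(index: str, doc_id: str, parsed_fields: dict) -> dict:
    return {
        "_op_type": "update",  # partial update instead of reindexing the whole document
        "_index": index,       # the document's own index, hence ElasticDocument("_all") above
        "_id": doc_id,
        "doc": parsed_fields,  # e.g. the analyzer output for each parsed field
    }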
def evaluate_entity_tags_task(object_id: int, indices: List[str], query: dict, es_timeout: int = 10, scroll_size: int = 100):
    # Fetch the object outside the try block so it is guaranteed to exist
    # when the exception handler reports the failure.
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Starting entity evaluator task for Evaluator with ID {object_id}.")

        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(key_field="fact", value_field="doc_path", filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(
            indices=indices,
            field_data=[doc_path, "texta_facts"],
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        # Get the number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get the number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(searcher, evaluator_object, true_fact, pred_fact, doc_path, token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate the confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name, create_confusion_plot(scores["confusion_matrix"], classes), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
def evaluate_tags_task(object_id: int, indices: List[str], query: dict, es_timeout: int = 10, scroll_size: int = 100):
    # Fetch the object outside the try block so it is guaranteed to exist
    # when the exception handler reports the failure.
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Starting evaluator task for Evaluator with ID {object_id}.")

        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve the facts and the sklearn averaging function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value
        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(
            indices=indices,
            field_data=["texta_facts"],
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        # Binary evaluation
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(f"Starting binary evaluation. Comparing the following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value}).")

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass evaluation
        else:
            logging.getLogger(INFO_LOGGER).info(f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'.")

            # Make a deepcopy of the query to avoid modifying the searcher's query.
            es_aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))

            # Get all fact values corresponding to the true and predicted facts to construct the total set of labels
            # needed for the confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(size=choices.DEFAULT_MAX_AGGREGATION_SIZE, filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(size=choices.DEFAULT_MAX_AGGREGATION_SIZE, filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend([choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            classes.sort(key=lambda x: x[0].lower())

        # Get the number of documents in the query to estimate the memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(f"Number of documents: {n_docs} | Number of classes: {len(classes)}")

        # Get the memory buffer value from the core variables
        core_memory_buffer_value_gb = get_core_setting("TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on the given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(memory_buffer=core_memory_buffer_value_gb, ratio=EVALUATOR_MEMORY_BUFFER_RATIO, unit="gb")

        required_memory = get_memory_imprint(n_docs=n_docs, n_classes=len(classes), eval_type=evaluator_object.evaluation_type, unit="gb", int_size=64)
        enough_memory = is_enough_memory_available(required_memory=required_memory, memory_buffer=memory_buffer_gb, unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and scores are averaged after each scroll,
        # the results for every averaging function besides `micro` are imprecise
        scores_imprecise = score_after_scroll and average != "micro"

        # Store the document count, the labels' class counts and an indicator of whether the scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}")

        # Get the number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score the tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results
        )

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)
            scores["confusion_matrix"] = confusion.tolist()

        # Generate the confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name, create_confusion_plot(scores["confusion_matrix"], classes), save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add the final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(scores["confusion_matrix"])
        evaluator_object.individual_results = json.dumps(remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
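# A rough sketch (an assumption, not the project's actual helper) of how
# `get_memory_imprint` above might estimate the requirement: single-pass
# scoring keeps a binarized label matrix of n_docs x n_classes integers for
# both the true and the predicted labels in the multilabel case, and one
# label per document for each side in the binary case.
def estimated_memory_gb(n_docs: int, n_classes: int, eval_type: str = "multilabel", int_size: int = 64) -> float:
    bytes_per_int = int_size // 8
    if eval_type == "multilabel":
        cells = 2 * n_docs * n_classes  # true + predicted label matrices
    else:
        cells = 2 * n_docs              # true + predicted label vectors
    return cells * bytes_per_int / 1024 ** 3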
def reindex_task(reindexer_task_id: int):
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'reindex' with ID {reindexer_task_id}.")
    # Fetch the objects outside the try block so the exception handler
    # can always reach the task object.
    reindexer_obj = Reindexer.objects.get(pk=reindexer_task_id)
    task_object = reindexer_obj.task
    try:
        indices = json.loads(reindexer_obj.indices)
        fields = json.loads(reindexer_obj.fields)
        random_size = reindexer_obj.random_size
        field_type = json.loads(reindexer_obj.field_type)
        scroll_size = reindexer_obj.scroll_size
        new_index = reindexer_obj.new_index
        query = json.loads(reindexer_obj.query)

        # if no fields are given, use all fields from all selected indices
        if not fields:
            fields = ElasticCore().get_fields(indices)
            fields = [field["path"] for field in fields]

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        elastic_search = ElasticSearcher(indices=indices, field_data=fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
        task_object.set_total(elastic_search.count())

        elastic_doc = ElasticDocument(new_index)

        if random_size > 0:
            elastic_search = elastic_search.random_documents(size=random_size)

        logging.getLogger(INFO_LOGGER).info("Updating index schema.")
        # The operations that don't require a mapping update have been completed.
        schema_input = update_field_types(indices, fields, field_type, flatten_doc=FLATTEN_DOC)
        updated_schema = update_mapping(schema_input, new_index, reindexer_obj.add_facts_mapping, add_texta_meta_mapping=False)

        logging.getLogger(INFO_LOGGER).info("Creating new index.")
        # create the new index
        create_index_res = ElasticCore().create_index(new_index, updated_schema)
        Index.objects.get_or_create(name=new_index)

        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
        # set the new_index name as the mapping name; perhaps make it customizable in the future
        bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=FLATTEN_DOC, field_data=field_type)

        # declare the job done
        task_object.complete()
        logging.getLogger(INFO_LOGGER).info("Reindexing successfully completed.")
        return True

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
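# Usage sketch: these functions are presumably registered as Celery tasks
# (the decorators are not shown here; the first three take `self`, which
# suggests `bind=True`). A reindex run would then be queued along the
# lines of:
#
#   reindexer = Reindexer.objects.get(pk=some_pk)  # `some_pk` is a placeholder
#   reindex_task.apply_async(args=(reindexer.pk,))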