Example 1
def start_fact_delete_query_task(self, worker_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Celery: Starting task for deleting facts by query for worker with ID: {worker_object.pk}"
        )

        # init progress
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=worker_object.get_indices(),
                                   output=ElasticSearcher.OUT_DOC,
                                   callback_progress=show_progress,
                                   scroll_size=worker_object.scroll_size,
                                   field_data=["texta_facts"])

        count = searcher.count()

        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise
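The `self` argument marks these functions as bound Celery tasks. A minimal sketch of how such a task might be registered and queued, assuming a standard Celery application (the `app` instance and broker URL below are hypothetical):

from celery import Celery

app = Celery("texta", broker="redis://localhost:6379/0")  # hypothetical app and broker

@app.task(bind=True)
def start_fact_delete_query_task(self, worker_id: int):
    ...  # body as in the example above

# Dispatch asynchronously by the worker object's primary key:
# start_fact_delete_query_task.delay(worker_object.pk)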
Example 2
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Applying MLP on the index for MLP Task ID: {mlp_id}")
        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)
        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(mlp_object.query),
                                   indices=indices,
                                   output=ElasticSearcher.OUT_META,
                                   callback_progress=show_progress,
                                   scroll_size=es_scroll_size,
                                   scroll_timeout=f"{es_timeout}m")
        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        show_progress.update_step(
            f'Applying MLP to {len(doc_chunks)} chunks of documents')
        show_progress.update_view(0)

        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass the document IDs to the next task
        chain = group(
            apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id)
            for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
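`chunks_iter` and `MLP_BATCH_SIZE` come from the surrounding codebase and are not shown here. A minimal sketch of what such a chunking helper typically looks like (this implementation is an assumption, not the project's own):

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunks_iter(iterable: Iterable[T], chunk_size: int) -> Iterator[List[T]]:
    # Yield successive lists of up to chunk_size items from any iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk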
Example 3
def apply_analyzers_on_indices(self, worker_id: int):
    """
    Scrolls the documents and applies the selected Elasticsearch analyzers to them.
    """
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=fields,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout=scroll_timeout)

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(generator=searcher,
                                           worker=worker_object,
                                           detect_lang=detect_lang,
                                           snowball_language=snowball_language,
                                           fields_to_parse=fields,
                                           analyzers=analyzers,
                                           tokenizer=tokenizer,
                                           strip_html=strip_html)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        task_object.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise
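`process_analyzer_actions` is defined elsewhere in the codebase. As a rough illustration of what applying an Elasticsearch analyzer involves, the hedged sketch below runs a snowball stemmer through the `_analyze` endpoint (elasticsearch-py 7.x style; the host URL and sample text are placeholders):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder host

# Tokenize the text and stem each token with the snowball filter.
response = es.indices.analyze(body={
    "tokenizer": "standard",
    "filter": [{"type": "snowball", "language": "English"}],
    "text": "running runners ran",
})
stemmed_tokens = [token["token"] for token in response["tokens"]]  # e.g. ["run", "runner", "ran"]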
Example 4
def evaluate_entity_tags_task(object_id: int,
                              indices: List[str],
                              query: dict,
                              es_timeout: int = 10,
                              scroll_size: int = 100):
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
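`scroll_and_score_entity` is not shown here. As an illustration of the token-based comparison it is configured for, the sketch below scores predicted entities against true ones over token sets; the helper and its inputs are hypothetical:

from typing import List, Tuple

def token_based_scores(true_entities: List[str], pred_entities: List[str]) -> Tuple[float, float, float]:
    # Compare the token sets of the true and predicted entity strings.
    true_tokens = {token for entity in true_entities for token in entity.split()}
    pred_tokens = {token for entity in pred_entities for token in entity.split()}
    overlap = len(true_tokens & pred_tokens)
    precision = overlap / len(pred_tokens) if pred_tokens else 0.0
    recall = overlap / len(true_tokens) if true_tokens else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# token_based_scores(["New York City"], ["New York"]) -> approximately (1.0, 0.667, 0.8)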
Example 5
def evaluate_tags_task(object_id: int,
                       indices: List[str],
                       query: dict,
                       es_timeout: int = 10,
                       scroll_size: int = 100):
    evaluator_object = Evaluator.objects.get(pk=object_id)
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting evaluator task for Evaluator with ID {object_id}.")

        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve the facts and the sklearn average function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value

        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(indices=indices,
                                   field_data=["texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Binary
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting binary evaluation. Comparing following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value})."
            )

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass
        else:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'."
            )

            # Make deepcopy of the query to avoid modifying Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))

            # Get all fact values corresponding to true and predicted facts to construct total set of labels
            # needed for confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend(
                [choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            # Sort the class labels case-insensitively
            classes.sort(key=lambda x: x.lower())

        # Get number of documents in the query to estimate memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Number of documents: {n_docs} | Number of classes: {len(classes)}"
        )

        # Get the memory buffer value from core variables
        core_memory_buffer_value_gb = get_core_setting(
            "TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(
            memory_buffer=core_memory_buffer_value_gb,
            ratio=EVALUATOR_MEMORY_BUFFER_RATIO,
            unit="gb")

        required_memory = get_memory_imprint(
            n_docs=n_docs,
            n_classes=len(classes),
            eval_type=evaluator_object.evaluation_type,
            unit="gb",
            int_size=64)
        enough_memory = is_enough_memory_available(
            required_memory=required_memory,
            memory_buffer=memory_buffer_gb,
            unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled, the results for every
        # averaging function besides `micro` are imprecise.
        scores_imprecise = score_after_scroll and average != "micro"

        # Store the document count, the class counts and an indicator of whether the scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}"
        )

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)

        scores["confusion_matrix"] = confusion.tolist()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(
            scores["confusion_matrix"])

        evaluator_object.individual_results = json.dumps(
            remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
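`get_memory_imprint` and `is_enough_memory_available` are project helpers. A minimal sketch of the kind of estimate the calls above suggest, assuming one `int_size`-bit integer per document-class cell (the exact formula and signatures are assumptions; the real helpers also take an `eval_type`):

import psutil

def get_memory_imprint(n_docs: int, n_classes: int, unit: str = "gb", int_size: int = 64) -> float:
    # Rough upper bound: one int_size-bit value per (document, class) pair.
    n_bytes = n_docs * n_classes * (int_size // 8)
    return n_bytes / 1024 ** 3 if unit == "gb" else float(n_bytes)

def is_enough_memory_available(required_memory: float, memory_buffer: float, unit: str = "gb") -> bool:
    # Compare the estimate against currently available RAM, keeping a safety buffer.
    available_gb = psutil.virtual_memory().available / 1024 ** 3
    return required_memory <= available_gb - memory_buffer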
Example 6
def reindex_task(reindexer_task_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'reindex' with ID {reindexer_task_id}.")
    reindexer_obj = Reindexer.objects.get(pk=reindexer_task_id)
    task_object = reindexer_obj.task
    try:
        indices = json.loads(reindexer_obj.indices)
        fields = json.loads(reindexer_obj.fields)
        random_size = reindexer_obj.random_size
        field_type = json.loads(reindexer_obj.field_type)
        scroll_size = reindexer_obj.scroll_size
        new_index = reindexer_obj.new_index
        query = json.loads(reindexer_obj.query)

        # if no fields, let's use all fields from all selected indices
        if not fields:
            fields = ElasticCore().get_fields(indices)
            fields = [field["path"] for field in fields]

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        elastic_search = ElasticSearcher(indices=indices,
                                         field_data=fields,
                                         callback_progress=show_progress,
                                         query=query,
                                         scroll_size=scroll_size)
        task_object.set_total(elastic_search.count())
        elastic_doc = ElasticDocument(new_index)

        if random_size > 0:
            elastic_search = elastic_search.random_documents(size=random_size)

        logging.getLogger(INFO_LOGGER).info("Updating index schema.")
        # The operations that don't require a mapping update are now complete.
        schema_input = update_field_types(indices,
                                          fields,
                                          field_type,
                                          flatten_doc=FLATTEN_DOC)
        updated_schema = update_mapping(schema_input,
                                        new_index,
                                        reindexer_obj.add_facts_mapping,
                                        add_texta_meta_mapping=False)

        logging.getLogger(INFO_LOGGER).info("Creating new index.")
        # create new_index
        create_index_res = ElasticCore().create_index(new_index,
                                                      updated_schema)
        Index.objects.get_or_create(name=new_index)

        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
        # set new_index name as mapping name, perhaps make it customizable in the future
        bulk_add_documents(elastic_search,
                           elastic_doc,
                           index=new_index,
                           chunk_size=scroll_size,
                           flatten_doc=FLATTEN_DOC,
                           field_data=field_type)

        # declare the job done
        task_object.complete()

        logging.getLogger(INFO_LOGGER).info(
            "Reindexing successfully completed.")
        return True

    except Exception as e:
        task_object.handle_failed_task(e)
        raise
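`bulk_add_documents` is a project helper. A hedged sketch of the underlying pattern using elasticsearch-py's `streaming_bulk` (the client host and the shape of the scrolled documents are placeholders):

import logging
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

def bulk_index_documents(docs, index: str, chunk_size: int = 500):
    # Index an iterable of plain document dicts into the target index in chunks.
    es = Elasticsearch("http://localhost:9200")  # placeholder host
    actions = ({"_index": index, "_source": doc} for doc in docs)
    for ok, item in streaming_bulk(es, actions, chunk_size=chunk_size):
        if not ok:
            logging.getLogger(__name__).error("Failed to index: %s", item)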