Exemple #1
0
def start_fact_delete_query_task(self, worker_id: int):
    """
    Prepare progress tracking for a delete-facts-by-query task.

    Looks up the DeleteFactsByQueryTask, counts the documents matching its
    stored query and records that count as the task total.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        worker_id: Primary key of the DeleteFactsByQueryTask to start.

    Returns:
        True once the total has been stored on the task.

    Raises:
        Exception: Any error is handed to the task's ``handle_failed_task``
            and then re-raised.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        info_logger = logging.getLogger(INFO_LOGGER)
        info_logger.info(
            f"Celery: Starting task for deleting facts by query for project with ID: {worker_object.pk}"
        )

        # Reset the progress view before touching Elasticsearch.
        progress = ShowProgress(worker_object.task, multiplier=1)
        progress.update_step('Scrolling document IDs')
        progress.update_view(0)

        # The searcher is only used here to count the matching documents.
        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=worker_object.get_indices(),
            output=ElasticSearcher.OUT_DOC,
            callback_progress=progress,
            scroll_size=worker_object.scroll_size,
            field_data=["texta_facts"],
        )
        doc_total = searcher.count()

        progress.update_step(f'Deleting facts from {doc_total} documents')
        progress.update_view(0)
        worker_object.task.set_total(doc_total)
        return True

    except Exception as exc:
        worker_object.task.handle_failed_task(exc)
        raise exc
Exemple #2
0
def fact_delete_query_task(self, worker_id: int):
    """
    Run the bulk update that removes facts for a DeleteFactsByQueryTask.

    Scrolls the documents matching the task's stored query, turns them into
    bulk actions with ``query_delete_actions_generator`` and sends those
    actions to Elasticsearch. The task is marked complete afterwards.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        worker_id: Primary key of the DeleteFactsByQueryTask to process.

    Returns:
        ``worker_id``, unchanged.

    Raises:
        Exception: Any error is handed to the task's ``handle_failed_task``
            and then re-raised.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        progress = ShowProgress(worker_object.task, multiplier=1)
        progress.update_step('Scrolling through the indices to delete the facts.')

        # Read the task configuration up front.
        index_names: List[str] = worker_object.get_indices()
        facts_to_remove = json.loads(worker_object.facts)
        batch_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=index_names,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=batch_size,
            scroll_timeout=f"{worker_object.es_timeout}m",
        )

        # No index is given to the document wrapper; the generated actions are
        # passed to the bulk update as they are.
        ElasticDocument(index=None).bulk_update(
            query_delete_actions_generator(searcher, facts_to_remove))

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as exc:
        worker_object.task.handle_failed_task(exc)
        raise exc
Exemple #3
0
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index.

    Loads the SearchFieldsTagger with primary key ``object_id``, scrolls the
    documents matching its stored query and sends the update actions produced
    by ``update_search_fields_generator`` to Elasticsearch in bulk.

    Args:
        object_id: Primary key of the SearchFieldsTagger to apply.

    Returns:
        ``object_id``, unchanged.

    Raises:
        Exception: Any error raised while processing is logged, recorded on
            the task (status set to failed) and re-raised.
    """
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        # Make sure every target index has the facts mapping before updating.
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)
        # NOTE(review): the task is not marked complete here - presumably a
        # follow-up task does that; confirm against the task chain.
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Exemple #4
0
def apply_analyzers_on_indices(self, worker_id: int):
    """
    Run the configured ES analyzers over the documents selected by a worker.

    Reads the settings stored on the ApplyESAnalyzerWorker, scrolls the
    matching documents, builds update actions with
    ``process_analyzer_actions`` and sends them to Elasticsearch in bulk.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        worker_id: Primary key of the ApplyESAnalyzerWorker to run.

    Returns:
        ``worker_id``, unchanged.

    Raises:
        Exception: Any error is handed to the task's ``handle_failed_task``
            and then re-raised.
    """
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        progress = ShowProgress(task_object, multiplier=1)
        progress.update_step('scrolling through the indices to apply lang')

        # Collect the worker configuration.
        index_names: List[str] = worker_object.get_indices()
        target_fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        stemmer_language = worker_object.stemmer_lang
        timeout = f"{worker_object.es_timeout}m"
        batch_size = worker_object.bulk_size
        analyzer_names = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=index_names,
            field_data=target_fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=batch_size,
            scroll_timeout=timeout,
        )

        task_object.set_total(searcher.count())

        analyzer_actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=stemmer_language,
            fields_to_parse=target_fields,
            analyzers=analyzer_names,
            tokenizer=tokenizer,
            strip_html=strip_html,
        )

        # Push the updates to Elasticsearch using the same batch size as the scroll.
        ElasticDocument("_all").bulk_update(actions=analyzer_actions,
                                            chunk_size=batch_size)

        worker_object.task.complete()

        return worker_id

    except Exception as exc:
        task_object.handle_failed_task(exc)
        raise exc
Exemple #5
0
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.

    The scrolled document metadata is split into batches of MLP_BATCH_SIZE;
    each batch becomes one ``apply_mlp_on_es_docs`` task in a Celery group
    that is followed by ``end_mlp_task``.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        mlp_id: Primary key of the MLPWorker to start.

    Returns:
        True once the follow-up tasks have been dispatched.

    Raises:
        Exception: Any error is handed to the task's ``handle_failed_task``
            and then re-raised.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Applying mlp on the index for MLP Task ID: {mlp_id}")
        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)
        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(mlp_object.query),
                                   indices=indices,
                                   output=ElasticSearcher.OUT_META,
                                   callback_progress=show_progress,
                                   scroll_size=es_scroll_size,
                                   scroll_timeout=f"{es_timeout}m")
        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        # len(doc_chunks) is the number of batches, not documents, so report
        # the same document count that is stored as the task total.
        total_docs = searcher.count()
        show_progress.update_step(f'Applying MLP to {total_docs} documents')
        show_progress.update_view(0)

        mlp_object.task.set_total(total_docs)
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass document id-s to the next task
        chain = group(
            apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id)
            for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
Exemple #6
0
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index.

    Scrolls the documents matching ``query`` in ``indices``, builds update
    actions with ``update_generator`` and streams them to Elasticsearch.

    Args:
        object_id: Primary key of the TorchTaggerObject to apply.
        indices: Names of the indices to process.
        fields: Document fields passed to the searcher and the generator.
        fact_name: Passed through to ``update_generator``.
        fact_value: Passed through to ``update_generator``.
        query: Elasticsearch query selecting the documents.
        bulk_size: Scroll size and bulk chunk size.
        max_chunk_bytes: Upper bound for a single bulk request.
        es_timeout: Timeout in minutes handed to the searcher.

    Returns:
        True on success. When processing fails after the tagger object was
        loaded, the error is logged and recorded on the task and None is
        returned.

    Raises:
        Exception: If the TorchTaggerObject lookup itself fails there is no
            task to record the failure on, so the original error is re-raised.
    """
    # Bound before the try block so the failure handler can tell whether the
    # lookup succeeded instead of tripping over an unbound name.
    tagger_object = None
    try:
        tagger_object = TorchTaggerObject.objects.get(pk=object_id)
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                # No exception is active here, so use error() rather than
                # exception(), which would append an empty traceback.
                logging.getLogger(ERROR_LOGGER).error(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if tagger_object is None:
            # The lookup failed; surface the real error.
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
0
def apply_lang_on_indices(self, apply_worker_id: int):
    """
    Process one field of the selected documents with ``process_lang_actions``.

    Loads MLP, scrolls the documents matching the worker's stored query,
    builds update actions with ``process_lang_actions`` and sends them to
    Elasticsearch in bulk.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        apply_worker_id: Primary key of the ApplyLangWorker to run.

    Returns:
        ``apply_worker_id``, unchanged.

    Raises:
        Exception: Any error is handed to the task's ``handle_failed_task``
            and then re-raised.
    """
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        progress = ShowProgress(task_object, multiplier=1)
        progress.update_step('scrolling through the indices to apply lang')

        # Worker configuration.
        index_names: List[str] = worker_object.get_indices()
        target_field = worker_object.field

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=index_names,
            field_data=[target_field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=100,
            scroll_timeout="15m",
        )

        # Make sure every index has the facts mapping before updating.
        for index_name in index_names:
            searcher.core.add_texta_facts_mapping(index=index_name)

        lang_actions = process_lang_actions(
            generator=searcher,
            field=target_field,
            worker_id=apply_worker_id,
            mlp_class=mlp,
        )

        # Push the updates to Elasticsearch.
        ElasticDocument("_all").bulk_update(actions=lang_actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as exc:
        task_object.handle_failed_task(exc)
        raise exc
Exemple #8
0
def start_rakun_task(self, object_id: int):
    """
    Reset progress on a RakunExtractor's task to the 'starting rakun' step.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        object_id: Primary key of the RakunExtractor.

    Returns:
        ``object_id``, unchanged.
    """
    extractor = RakunExtractor.objects.get(pk=object_id)
    progress = ShowProgress(extractor.task, multiplier=1)
    progress.update_step('starting rakun')
    progress.update_view(0)
    return object_id
Exemple #9
0
def start_tagger_task(tagger_id: int):
    """
    Reset progress on a Tagger's task to the 'starting tagging' step.

    Args:
        tagger_id: Primary key of the Tagger.

    Returns:
        ``tagger_id``, unchanged.
    """
    tagger_task = Tagger.objects.get(pk=tagger_id).task
    progress = ShowProgress(tagger_task, multiplier=1)
    progress.update_step('starting tagging')
    progress.update_view(0)
    return tagger_id
Exemple #10
0
def apply_summarizer_on_index(self, summarizer_id: int):
    """
    Apply a Summarizer to the documents matching its stored query.

    Loads sumy, scrolls the matching documents, builds update actions with
    ``process_actions`` and sends them to Elasticsearch in bulk.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        summarizer_id: Primary key of the Summarizer to apply.

    Returns:
        ``summarizer_id``, unchanged.

    Raises:
        Exception: Any error raised while processing is logged, recorded on
            the task (status set to failed) and re-raised.
    """
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        # `float[str]` is not a valid type; the ratio is a plain float.
        ratio_data: float = summarizer_object.ratio
        # NOTE(review): the stored type of `algorithm` is not visible here, so
        # it is left unannotated - confirm against the model definition.
        algorithm_data = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(summarizer_object.query),
                                   indices=indices,
                                   field_data=field_data,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="30m")

        actions = process_actions(searcher,
                                  field_data,
                                  ratio_data,
                                  algorithm=algorithm_data,
                                  summarizer_class=sumy,
                                  summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Exemple #11
0
def start_clustering_task(clustering_id: int):
    """
    Reset progress on a ClusteringResult's task to the 'starting clustering' step.

    Args:
        clustering_id: Primary key of the ClusteringResult.

    Returns:
        ``clustering_id``, unchanged.
    """
    clustering_task = ClusteringResult.objects.get(pk=clustering_id).task
    progress = ShowProgress(clustering_task, multiplier=1)
    progress.update_step('starting clustering')
    progress.update_view(0)
    return clustering_id
Exemple #12
0
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index.

    Scrolls the documents matching ``query`` in ``indices``, builds update
    actions with ``update_generator`` and sends them to Elasticsearch in bulk.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        object_id: ID of the RakunExtractor to apply.
        indices: Names of the indices to process.
        fields: Document fields passed to the searcher and the generator.
        query: Elasticsearch query selecting the documents.
        es_timeout: Timeout in minutes handed to the searcher.
        bulk_size: Scroll size for the searcher.
        fact_name: Passed through to ``update_generator``.
        add_spans: Passed through to ``update_generator``.

    Returns:
        True on success. On failure the error is logged and recorded on the
        task (status set to failed) and None is returned.
    """
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Make sure every target index has the facts mapping.
        ec = ElasticCore()
        for index_name in indices:
            ec.add_texta_facts_mapping(index_name)

        searcher = ElasticSearcher(
            indices=indices,
            # Existing facts are fetched too so new ones can be added to them.
            field_data=fields + ["texta_facts"],
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size,
        )
        detector = rakun_extractor_object.load_rakun_keyword_detector()
        update_actions = update_generator(
            keyword_detector=detector,
            generator=searcher,
            ec=ec,
            fields=fields,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans,
        )

        # Push the updates to Elasticsearch.
        ElasticDocument("_all").bulk_update(actions=update_actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as exc:
        logging.getLogger(ERROR_LOGGER).exception(exc)
        # Only the first 100 characters are stored in case the message is massive.
        short_message = f"{str(exc)[:100]}..."
        rakun_extractor_object.task.add_error(short_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
0
def start_summarizer_worker(self, summarizer_id: int):
    """
    Log the start and reset progress on a Summarizer's task.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        summarizer_id: Primary key of the Summarizer.

    Returns:
        ``summarizer_id``, unchanged.
    """
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(
        f"Starting applying summarizer on the index for model ID: {summarizer_id}"
    )
    summarizer_task = Summarizer.objects.get(pk=summarizer_id).task
    progress = ShowProgress(summarizer_task, multiplier=1)
    progress.update_step('running summarizer')
    progress.update_view(0)
    return summarizer_id
Exemple #14
0
def start_search_fields_tagger_worker(self, object_id: int):
    """
    Log the start and reset progress on a SearchFieldsTagger's task.

    Args:
        self: Not used in the body (presumably the bound Celery task - confirm).
        object_id: Primary key of the SearchFieldsTagger.

    Returns:
        ``object_id``, unchanged.
    """
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(
        f"Starting applying search fields tagger on the index for model ID: {object_id}"
    )
    tagger_task = SearchFieldsTagger.objects.get(pk=object_id).task
    progress = ShowProgress(tagger_task, multiplier=1)
    progress.update_step('running search fields tagger')
    progress.update_view(0)
    return object_id
Exemple #15
0
def start_crf_task(crf_id: int):
    """
    Reset progress on a CRFExtractorObject's task before processing starts.

    Args:
        crf_id: Primary key of the CRFExtractorObject.

    Returns:
        ``crf_id``, unchanged.
    """
    crf_task = CRFExtractorObject.objects.get(pk=crf_id).task
    progress = ShowProgress(crf_task, multiplier=1)
    progress.update_step('starting tagging')
    progress.update_view(0)
    return crf_id
Exemple #16
0
def apply_crf_extractor_to_index(object_id: int, indices: List[str],
                                 mlp_fields: List[str], label_suffix: str,
                                 query: dict, bulk_size: int,
                                 max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.

    Scrolls the documents matching ``query`` in ``indices``, builds update
    actions with ``update_generator`` and sends them to Elasticsearch in bulk.
    A failure of the bulk update itself is only logged; the task is still
    marked complete afterwards.

    Args:
        object_id: Primary key of the CRFExtractorObject to apply.
        indices: Names of the indices to process.
        mlp_fields: Document fields passed to the searcher and the generator.
        label_suffix: Passed through to ``update_generator``.
        query: Elasticsearch query selecting the documents.
        bulk_size: Scroll size for the searcher.
        max_chunk_bytes: Not used in the body.
        es_timeout: Timeout in minutes handed to the searcher.

    Returns:
        True once the task has been marked complete.

    Raises:
        Exception: Any error outside the bulk update is re-raised, after being
            handed to the task's ``handle_failed_task`` when the extractor
            object had been loaded.
    """
    # Bound before the try block so a failed lookup does not turn into an
    # UnboundLocalError inside the failure handler.
    crf_object = None
    try:
        # load model
        crf_object = CRFExtractorObject.objects.get(pk=object_id)
        extractor = crf_object.load_extractor()
        # progress
        progress = ShowProgress(crf_object.task)
        # add fact field if missing
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)
        # search
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size)
        # create update actions
        actions = update_generator(generator=searcher,
                                   ec=ec,
                                   mlp_fields=mlp_fields,
                                   label_suffix=label_suffix,
                                   object_id=object_id,
                                   extractor=extractor)
        # perform updates
        try:
            # as we have defined indices in actions there is no need to do it again (None)
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        # all done
        crf_object.task.complete()
        return True

    except Exception as e:
        if crf_object is not None:
            crf_object.task.handle_failed_task(e)
        raise e
Exemple #17
0
def save_tagger_results(result_data: dict):
    """
    Store tagger results on the Tagger and mark its task complete.

    After the Tagger is saved, the model file it pointed to before the update
    is removed if it still exists.

    Args:
        result_data: Result dictionary. The keys read here are ``id``,
            ``tagger_path``, ``precision``, ``recall``, ``f1_score``,
            ``num_features``, ``num_examples``, ``model_size``, ``plot``,
            ``confusion_matrix`` and ``classes``.

    Raises:
        Exception: Any error is logged and re-raised. When the task had been
            loaded, the error is also recorded on it and its status is set to
            failed.
    """
    # Bound before the try block: a missing 'id' key or a failed lookup would
    # otherwise raise UnboundLocalError in the handler and hide the real error.
    task_object = None
    try:
        tagger_id = result_data['id']
        logging.getLogger(INFO_LOGGER).info(
            f"Starting task results for tagger with ID: {tagger_id}!")
        tagger_object = Tagger.objects.get(pk=tagger_id)

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(
            tagger_object.model.path) if tagger_object.model else None

        task_object = tagger_object.task
        show_progress = ShowProgress(task_object, multiplier=1)
        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)
        tagger_object.model.name = result_data["tagger_path"]
        tagger_object.precision = result_data["precision"]
        tagger_object.recall = result_data["recall"]
        tagger_object.f1_score = result_data["f1_score"]
        tagger_object.num_features = result_data["num_features"]
        tagger_object.num_examples = json.dumps(result_data["num_examples"])
        tagger_object.model_size = result_data["model_size"]
        tagger_object.plot.name = result_data["plot"]
        tagger_object.confusion_matrix = result_data["confusion_matrix"]
        tagger_object.classes = json.dumps(result_data["classes"],
                                           ensure_ascii=False)
        tagger_object.save()
        task_object.complete()

        # Cleanup after the transaction to ensure integrity database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if task_object is not None:
            task_object.add_error(str(e))
            task_object.update_status(Task.STATUS_FAILED)
        raise e
Exemple #18
0
def save_crf_results(result_data: dict):
    """
    Saves task results to database.

    After the CRFExtractorObject is saved, the model file it pointed to
    before the update is removed if it still exists.

    Args:
        result_data: Result dictionary. The keys read here are ``id``,
            ``best_c_values``, ``extractor_path``, ``precision``, ``recall``,
            ``f1_score``, ``model_size``, ``confusion_matrix`` and ``plot``.

    Returns:
        True once the results are saved and the task is marked complete.

    Raises:
        Exception: Any error is re-raised, after being handed to the task's
            ``handle_failed_task`` when the task had been loaded.
    """
    # Bound before the try block: a missing 'id' key or a failed lookup would
    # otherwise raise UnboundLocalError in the handler and hide the real error.
    task_object = None
    try:
        crf_id = result_data['id']
        logging.getLogger(INFO_LOGGER).info(
            f"Starting task results for CRFExtractor with ID: {crf_id}!")
        crf_object = CRFExtractorObject.objects.get(pk=crf_id)

        # Remember the previous model file so it can be removed after saving.
        model_path = pathlib.Path(
            crf_object.model.path) if crf_object.model else None

        task_object = crf_object.task
        show_progress = ShowProgress(task_object, multiplier=1)
        # update status to saving
        show_progress.update_step('saving')
        show_progress.update_view(0)
        crf_object.best_c1 = result_data["best_c_values"][0]
        crf_object.best_c2 = result_data["best_c_values"][1]
        crf_object.model.name = result_data["extractor_path"]
        crf_object.precision = result_data["precision"]
        crf_object.recall = result_data["recall"]
        crf_object.f1_score = result_data["f1_score"]
        crf_object.model_size = result_data["model_size"]
        crf_object.confusion_matrix = result_data["confusion_matrix"]
        crf_object.plot.name = result_data["plot"]
        crf_object.save()
        task_object.complete()

        # Cleanup after the transaction to ensure integrity database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True
    except Exception as e:
        if task_object is not None:
            task_object.handle_failed_task(e)
        raise e
Exemple #19
0
def evaluate_entity_tags_task(object_id: int,
                              indices: List[str],
                              query: dict,
                              es_timeout: int = 10,
                              scroll_size: int = 100):
    """
    Run the entity evaluation for an Evaluator and save its confusion plot.

    Scrolls the documents matching ``query``, scores them with
    ``scroll_and_score_entity`` and stores a confusion-matrix plot on the
    Evaluator before marking its task complete.

    Args:
        object_id: Primary key of the Evaluator.
        indices: Names of the indices to evaluate on.
        query: Elasticsearch query selecting the documents.
        es_timeout: Timeout in minutes handed to the searcher.
        scroll_size: Scroll size; also used to compute the number of batches.

    Returns:
        True on success. When evaluation fails after the Evaluator was
        loaded, the error is logged and recorded on the task and None is
        returned.

    Raises:
        Exception: If the Evaluator lookup itself fails there is no task to
            record the failure on, so the original error is re-raised.
    """
    # Bound before the try block so the failure handler can tell whether the
    # lookup succeeded instead of tripping over an unbound name.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            # The query is deep-copied before being handed to the aggregator;
            # the original is reused by the searcher below.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        # Drop database connections that are no longer usable before saving.
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if evaluator_object is None:
            # The lookup failed; surface the real error.
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
Exemple #20
0
def reindex_task(reindexer_task_id: int):
    """
    Create a new index and fill it with documents from existing indices.

    Reads the settings stored on the Reindexer, builds the schema for the new
    index, creates it, registers it as an Index record and bulk-adds the
    documents returned by the searcher.

    Args:
        reindexer_task_id: Primary key of the Reindexer to run.

    Returns:
        True once the task has been marked complete.

    Raises:
        Exception: Any error is re-raised, after being handed to the task's
            ``handle_failed_task`` when the task had been loaded.
    """
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'reindex' with ID {reindexer_task_id}.")
    # Bound before the try block so a failed lookup does not turn into an
    # UnboundLocalError inside the failure handler.
    task_object = None
    try:
        reindexer_obj = Reindexer.objects.get(pk=reindexer_task_id)
        task_object = reindexer_obj.task
        indices = json.loads(reindexer_obj.indices)
        fields = json.loads(reindexer_obj.fields)
        random_size = reindexer_obj.random_size
        field_type = json.loads(reindexer_obj.field_type)
        scroll_size = reindexer_obj.scroll_size
        new_index = reindexer_obj.new_index
        query = json.loads(reindexer_obj.query)

        # if no fields, let's use all fields from all selected indices
        if not fields:
            fields = ElasticCore().get_fields(indices)
            fields = [field["path"] for field in fields]

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        elastic_search = ElasticSearcher(indices=indices,
                                         field_data=fields,
                                         callback_progress=show_progress,
                                         query=query,
                                         scroll_size=scroll_size)
        task_object.set_total(elastic_search.count())
        elastic_doc = ElasticDocument(new_index)

        # A positive random_size replaces the full scroll with a random subset.
        if random_size > 0:
            elastic_search = elastic_search.random_documents(size=random_size)

        logging.getLogger(INFO_LOGGER).info("Updating index schema.")
        # the operations that don't require a mapping update have been completed
        schema_input = update_field_types(indices,
                                          fields,
                                          field_type,
                                          flatten_doc=FLATTEN_DOC)
        updated_schema = update_mapping(schema_input,
                                        new_index,
                                        reindexer_obj.add_facts_mapping,
                                        add_texta_meta_mapping=False)

        logging.getLogger(INFO_LOGGER).info("Creating new index.")
        # create new_index
        ElasticCore().create_index(new_index, updated_schema)
        Index.objects.get_or_create(name=new_index)

        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
        # set new_index name as mapping name, perhaps make it customizable in the future
        bulk_add_documents(elastic_search,
                           elastic_doc,
                           index=new_index,
                           chunk_size=scroll_size,
                           flatten_doc=FLATTEN_DOC,
                           field_data=field_type)

        # declare the job done
        task_object.complete()

        logging.getLogger(INFO_LOGGER).info(
            "Reindexing succesfully completed.")
        return True

    except Exception as e:
        if task_object is not None:
            task_object.handle_failed_task(e)
        raise e
Exemple #21
0
def evaluate_tags_task(object_id: int,
                       indices: List[str],
                       query: dict,
                       es_timeout: int = 10,
                       scroll_size: int = 100):
    """
    Scores predicted facts against true facts and stores the results on the Evaluator.

    A binary evaluation is run when both the true and the predicted fact value
    are set on the Evaluator; otherwise a multilabel evaluation over all values
    of the two facts is run. The scores, the confusion matrix and its plot are
    saved on the Evaluator object and the task is marked as completed.

    :param object_id: Primary key of the Evaluator object.
    :param indices: Indices passed to ElasticSearcher / ElasticAggregator.
    :param query: Query restricting the documents to evaluate.
    :param es_timeout: Timeout handed to ElasticSearcher as f"{es_timeout}m" (presumably minutes).
    :param scroll_size: Scroll size for ElasticSearcher; also used to estimate the batch count.
    :return: True on success; None when the failure was recorded on the Evaluator's task.
    :raises Exception: The original error is re-raised when it happened before the
        Evaluator was loaded, because there is no task to record it on.
    """
    # Bound before the try-block so the error handler can tell whether the
    # Evaluator was loaded at all.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting evaluator task for Evaluator with ID {object_id}.")

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve facts and sklearn average function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value

        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(indices=indices,
                                   field_data=["texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Binary
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting binary evaluation. Comparing following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value})."
            )

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass
        else:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'."
            )

            # Make deepcopy of the query to avoid modifying Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))

            # Get all fact values corresponding to true and predicted facts to construct total set of labels
            # needed for confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=true_fact)
            pred_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=pred_fact)

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            # Counted before the dummy labels below are appended.
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend(
                [choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            classes.sort(key=lambda x: x[0].lower())

        # Get number of documents in the query to estimate memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Number of documents: {n_docs} | Number of classes: {len(classes)}"
        )

        # Get the memory buffer value from core variables
        core_memory_buffer_value_gb = get_core_setting(
            "TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(
            memory_buffer=core_memory_buffer_value_gb,
            ratio=EVALUATOR_MEMORY_BUFFER_RATIO,
            unit="gb")

        required_memory = get_memory_imprint(
            n_docs=n_docs,
            n_classes=len(classes),
            eval_type=evaluator_object.evaluation_type,
            unit="gb",
            int_size=64)
        enough_memory = is_enough_memory_available(
            required_memory=required_memory,
            memory_buffer=memory_buffer_gb,
            unit="gb")

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and scores are averaged after each scroll
        # the results for each averaging function besides `micro` are imprecise
        scores_imprecise = score_after_scroll and average != "micro"

        # Store document counts, labels' class counts and indicator if scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}"
        )

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        # Scoring can take a long time; drop DB connections that went stale meanwhile.
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)

        scores["confusion_matrix"] = confusion.tolist()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(
            scores["confusion_matrix"])

        evaluator_object.individual_results = json.dumps(
            remove_not_found(bin_scores), ensure_ascii=False)
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        # The failure happened before the Evaluator was loaded: there is no task
        # to mark as failed, so surface the original error instead of tripping
        # over an unbound `evaluator_object` below.
        if evaluator_object is None:
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
Exemple #22
0
def train_crf_task(crf_id: int):
    """
    Trains a CRF extractor model and returns its results for the next task.

    Scrolls the training documents, trains and saves the model, stores the
    report plot on the CRFExtractor object (without saving the object) and
    returns the statistics as a dict.

    :param crf_id: ID of the CRFExtractor object to train.
    :return: Dict with the object ID, best C values, relative model path,
        precision, recall, F1 score, confusion matrix, model size (MB) and plot path.
    :raises Exception: Any error is re-raised; when the task object was already
        loaded the error is first passed to its `handle_failed_task`.
    """
    # Bound before the try-block so the error handler never hits an unbound
    # name when the failure happens before the task object is loaded.
    task_object = None
    try:
        # get task object
        logging.getLogger(INFO_LOGGER).info(
            f"Starting task 'train_crf' for CRFExtractor with ID: {crf_id}!")
        crf_object = CRFExtractorObject.objects.get(id=crf_id)
        task_object = crf_object.task
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling documents')
        show_progress.update_view(0)
        # retrieve indices & field data
        indices = get_indices_from_object(crf_object)
        mlp_field = crf_object.mlp_field

        # load embedding if any
        if crf_object.embedding:
            embedding = crf_object.embedding.get_embedding()
            embedding.load_django(crf_object.embedding)
        else:
            embedding = None

        # scroll docs
        logging.getLogger(INFO_LOGGER).info(
            f"Scrolling data for CRFExtractor with ID: {crf_id}!")
        documents = ElasticSearcher(query=crf_object.get_query(),
                                    indices=indices,
                                    callback_progress=show_progress,
                                    text_processor=None,
                                    field_data=[mlp_field, "texta_facts"],
                                    output=ElasticSearcher.OUT_DOC,
                                    flatten=False)

        # create config
        config = crf_object.get_crf_config()
        # start training
        logging.getLogger(INFO_LOGGER).info(
            f"Training the model for CRFExtractor with ID: {crf_id}!")
        # create extractor
        extractor = CRFExtractor(config=config, embedding=embedding)
        # train the CRF model
        model_full_path, relative_model_path = crf_object.generate_name("crf")
        report, _ = extractor.train(documents,
                                    save_path=model_full_path,
                                    mlp_field=mlp_field)
        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        crf_object.plot.save(image_name,
                             create_tagger_plot(report.to_dict()),
                             save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        # pass results to next task
        return {
            "id": crf_id,
            "best_c_values": extractor.best_c_values,
            "extractor_path": relative_model_path,
            "precision": float(report.precision),
            "recall": float(report.recall),
            "f1_score": float(report.f1_score),
            "confusion_matrix": report.confusion.tolist(),
            # bytes to mb
            "model_size": round(
                float(os.path.getsize(model_full_path)) / 1000000, 1),
            "plot": str(image_path),
        }
    except Exception as e:
        # Only report to the task when it was actually loaded; otherwise just
        # let the original error propagate unmasked.
        if task_object is not None:
            task_object.handle_failed_task(e)
        raise
Exemple #23
0
def perform_data_clustering(clustering_id):
    """
    Clusters the documents selected by a ClusteringResult object.

    Scrolls the documents, runs the clustering, saves the vector
    transformation to disk and returns the information for the next step.

    :param clustering_id: ID of the ClusteringResult object.
    :return: Dict with the object's pk, the clustering results, fields, indices,
        display fields, relative vector path, stop words and significant words filter.
    :raises Exception: Any error is logged, recorded on the task and re-raised.
    """
    clustering_model = ClusteringResult.objects.get(id=clustering_id)

    try:
        # Settings stored on the model; several of them are JSON-encoded.
        stop_words = json.loads(clustering_model.stop_words)
        indices = clustering_model.get_indices()
        query = json.loads(clustering_model.query)
        ignored_ids = json.loads(clustering_model.ignored_ids)
        fields = json.loads(clustering_model.fields)
        display_fields = json.loads(clustering_model.display_fields)

        progress = ShowProgress(clustering_model.task, multiplier=1)
        progress.update_step("scrolling data")
        progress.update_view(0)

        # The phraser comes from the embedding, when one is attached.
        phraser = None
        if clustering_model.embedding:
            embedding = clustering_model.embedding.get_embedding()
            embedding.load_django(clustering_model.embedding)
            phraser = embedding.phraser

        # Can't give parser to TextProcessor as some processing is also done in Clustering class
        processor = TextProcessor(remove_stop_words=True,
                                  custom_stop_words=stop_words)

        # Stop words and ignored ids are filtered out while fetching the documents.
        searcher = ElasticSearcher(
            indices=indices,
            query=query,
            callback_progress=progress,
            text_processor=processor,
            ignore_ids=set(ignored_ids),
            output=ElasticSearcher.OUT_TEXT_WITH_ID,
            field_data=fields,
            scroll_limit=clustering_model.document_limit)

        docs = []
        for doc_id, document in searcher:
            docs.append({"id": doc_id, "document": document})

        # Run the clustering itself.
        clusters = Clustering(
            docs=docs,
            num_clusters=clustering_model.num_cluster,
            stop_words=stop_words,
            clustering_algorithm=clustering_model.clustering_algorithm,
            vectorizer=clustering_model.vectorizer,
            num_dims=clustering_model.num_dims,
            use_lsi=clustering_model.use_lsi,
            num_topics=clustering_model.num_topics,
            phraser=phraser)
        clusters.cluster()

        # Persist the vectors; only the relative path is handed on.
        full_vector_path, relative_vector_path = clustering_model.generate_name()
        clusters.save_transformation(full_vector_path)

        return {
            "pk": clustering_model.pk,
            "results": list(clusters.clustering_result.items()),
            "fields": fields,
            "indices": indices,
            "display_fields": display_fields,
            "vectors_filepath": relative_vector_path,
            "stop_words": stop_words,
            "significant_words_filter": clustering_model.significant_words_filter
        }

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        failed_task = clustering_model.task
        failed_task.add_error(str(e))
        failed_task.update_status(status=Task.STATUS_FAILED)
        clustering_model.save()
        raise e
Exemple #24
0
def train_torchtagger(tagger_id, testing=False):
    """
    Trains a TorchTagger model and stores the model, plot and statistics.

    :param tagger_id: Primary key of the TorchTagger object to train.
    :param testing: Not used by this function.
    :return: True when training finished and the results were saved.
    :raises Exception: Any error is re-raised; when the task object was already
        loaded the error is first recorded on it and the task is marked as failed.
    """
    # Bound before the try-block so the error handler never hits an unbound
    # name when the failure happens before the task object is loaded.
    task_object = None
    try:
        # retrieve neurotagger & task objects
        tagger_object = TorchTaggerObject.objects.get(pk=tagger_id)

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

        task_object = tagger_object.task
        model_type = TorchTaggerObject.MODEL_TYPE
        show_progress = ShowProgress(task_object, multiplier=1)
        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)
        # load embedding
        embedding = W2VEmbedding()
        embedding.load_django(tagger_object.embedding)
        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # get num examples and save to model
        num_examples = {k: len(v) for k, v in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(num_examples)

        tagger_object.save()

        # create TorchTagger
        tagger = TorchTagger(
            embedding,
            model_arch=tagger_object.model_architecture
        )
        # train tagger and get result statistics
        report = tagger.train(data_sample.data, num_epochs=int(tagger_object.num_epochs), pos_label=tagger_object.pos_label)
        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()
        # save tagger to disk
        tagger_path = os.path.join(RELATIVE_MODELS_PATH, model_type, f'{model_type}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)

        # set tagger location
        tagger_object.model.name = tagger_path
        # save tagger plot
        report_dict = report.to_dict()
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)
        # save label index
        tagger_object.label_index = json.dumps(tagger.label_reverse_index)
        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # save tagger object
        tagger_object.save()
        # declare the job done
        task_object.complete()

        # Cleanup after the transaction to ensure integrity database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True

    except Exception as e:
        # Only report to the task when it was actually loaded; otherwise just
        # let the original error propagate unmasked.
        if task_object is not None:
            task_object.add_error(str(e))
            task_object.update_status(Task.STATUS_FAILED)
        raise
Exemple #25
0
def train_bert_tagger(tagger_id, testing=False):
    """
    Trains (or continues training) a BERT tagger and stores its results.

    When the tagger object has a checkpoint model, that model is loaded and
    trained further; otherwise a new BertTagger is created. The trained model,
    the plot and the statistics are saved on the tagger object.

    :param tagger_id: Primary key of the BertTagger object to train.
    :param testing: Not used by this function.
    :return: True when training finished and the results were saved.
    :raises Exception: Any error is logged, recorded on the task and re-raised.
    """
    tagger_object = BertTaggerObject.objects.get(pk=tagger_id)

    # Remember the model file of a previous training run (retrain case) so it
    # can be removed once the new results are stored.
    previous_model_path = None
    if tagger_object.model:
        previous_model_path = pathlib.Path(tagger_object.model.path)

    task_object = tagger_object.task
    try:
        progress = ShowProgress(task_object, multiplier=1)

        # fields & indices to collect the samples from
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        checkpoint_model = tagger_object.checkpoint_model
        pos_label = tagger_object.pos_label

        # DataSample retrieves the positive and negative examples
        samples = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit)
        progress.update_step('training')
        progress.update_view(0.0)

        # the sklearn average function depends on the number of classes
        avg_function = (choices.DEFAULT_SKLEARN_AVG_BINARY
                        if samples.is_binary else
                        choices.DEFAULT_SKLEARN_AVG_MULTICLASS)

        if not checkpoint_model:
            # no checkpoint model given: train a new model from scratch
            logging.getLogger(INFO_LOGGER).info(
                "No checkpoint model detected, training a new model...")
            from_checkpoint = False
            # NB! saving pretrained models must be disabled!
            bert_tagger = BertTagger(
                allow_standard_output=choices.DEFAULT_ALLOW_STANDARD_OUTPUT,
                autoadjust_batch_size=choices.DEFAULT_AUTOADJUST_BATCH_SIZE,
                sklearn_avg_function=avg_function,
                use_gpu=tagger_object.use_gpu,
                save_pretrained=False,
                pretrained_models_dir=BERT_PRETRAINED_MODEL_DIRECTORY,
                logger=logging.getLogger(INFO_LOGGER),
                cache_dir=BERT_CACHE_DIR)
        else:
            # checkpoint model detected: load it and continue training from it
            logging.getLogger(INFO_LOGGER).info(
                f"Loading model from a checkpoint stored in '{tagger_object}'..."
            )

            # the pre-trained bert model must match the checkpoint's
            tagger_object.bert_model = checkpoint_model.bert_model
            bert_tagger = checkpoint_model.load_tagger()

            # the number of classes may differ from the checkpoint's
            bert_tagger.sklearn_avg_function = avg_function
            from_checkpoint = True

        # the state dict is used for binary taggers only
        if not samples.is_binary:
            bert_tagger.config.use_state_dict = False
            pos_label = ""
        else:
            bert_tagger.config.use_state_dict = True

        # train and collect the result statistics
        report = bert_tagger.train(samples.data,
                                   from_checkpoint=from_checkpoint,
                                   pos_label=pos_label,
                                   n_epochs=tagger_object.num_epochs,
                                   max_length=tagger_object.max_length,
                                   batch_size=tagger_object.batch_size,
                                   lr=tagger_object.learning_rate,
                                   eps=tagger_object.eps,
                                   split_ratio=tagger_object.split_ratio,
                                   bert_model=tagger_object.bert_model)

        # close db connections that became unusable during training
        for connection in connections.all():
            connection.close_if_unusable_or_obsolete()

        # write the trained model to disk and remember its location
        model_name = f'{tagger_object.MODEL_TYPE}_{tagger_id}_{secrets.token_hex(10)}'
        new_model_path = os.path.join(BERT_FINETUNED_MODEL_DIRECTORY,
                                      model_name)
        bert_tagger.save(new_model_path)
        tagger_object.model.name = new_model_path

        # plot of the report
        report_as_dict = report.to_dict()
        plot_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(plot_name,
                                create_tagger_plot(report_as_dict),
                                save=False)

        # label index
        tagger_object.label_index = json.dumps(
            bert_tagger.config.label_reverse_index)

        # statistics
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.validation_loss = report.validation_loss
        tagger_object.epoch_reports = json.dumps(
            [epoch.to_dict() for epoch in bert_tagger.epoch_reports])
        tagger_object.num_examples = json.dumps(
            {label: len(examples)
             for label, examples in samples.data.items()})
        tagger_object.adjusted_batch_size = bert_tagger.config.batch_size
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        tagger_object.save()
        # declare the job done
        task_object.complete()

        # The old model file is removed only after the new results are saved.
        if previous_model_path and previous_model_path.exists():
            previous_model_path.unlink(missing_ok=True)

        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        # Only the first 100 characters are stored in case the message is massive.
        short_message = f"{str(e)[:100]}..."
        tagger_object.task.add_error(short_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
        raise
Exemple #26
0
def train_embedding(embedding_id):
    """
    Trains an embedding on scrolled documents and saves the model.

    For embeddings of type "FastTextEmbedding" the model is additionally
    exported with gensim's `save_facebook_model`.

    :param embedding_id: Primary key of the Embedding object to train.
    :return: True when the model was trained and saved.
    :raises Exception: Any error is recorded on the task and re-raised.
    """
    embedding_object = Embedding.objects.get(pk=embedding_id)
    task_object = embedding_object.task

    progress = ShowProgress(task_object, multiplier=1)
    progress.update_step('training')
    progress.update_view(0)

    try:
        # settings of the training run
        indices = get_indices_from_object(embedding_object)
        field_data = json.loads(embedding_object.fields)
        max_documents = embedding_object.max_documents
        use_phraser = embedding_object.use_phraser
        snowball_language = embedding_object.snowball_language

        # a stemmer is used only when a snowball language is set
        lemmatizer = (ElasticAnalyzer(language=snowball_language)
                      if snowball_language else None)

        # iterator over the training texts
        query = json.loads(embedding_object.query)
        processor = TextProcessor(sentences=True,
                                  remove_stop_words=True,
                                  words_as_list=True,
                                  lemmatizer=lemmatizer)
        sentences = ElasticSearcher(query=query,
                                    indices=indices,
                                    field_data=field_data,
                                    callback_progress=progress,
                                    scroll_limit=max_documents,
                                    text_processor=processor,
                                    output=ElasticSearcher.OUT_TEXT)

        # build the embedding and train it
        embedding = embedding_object.get_embedding()
        embedding.train(sentences, use_phraser=use_phraser)

        # close db connections that became unusable during training
        for connection in connections.all():
            connection.close_if_unusable_or_obsolete()

        # write the model to disk
        progress.update_step('saving')
        full_model_path, relative_model_path = embedding_object.generate_name(
            "embedding")
        embedding.save(full_model_path)

        # FastText models are also stored in the facebook format
        if embedding_object.embedding_type == "FastTextEmbedding":
            fasttext_model = joblib.load(full_model_path)["embedding"]
            facebook_model_path = full_model_path + "_" + FACEBOOK_MODEL_SUFFIX
            gensim.models.fasttext.save_facebook_model(fasttext_model,
                                                       facebook_model_path,
                                                       encoding='utf-8')

        # store the model path and vocabulary size on the object
        embedding_object.embedding_model.name = relative_model_path
        embedding_object.vocab_size = embedding.model.wv.vectors.shape[0]
        embedding_object.save()

        # declare the job done
        task_object.complete()
        return True
    except Exception as e:
        # declare the job failed
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
Exemple #27
0
def train_tagger_task(tagger_id: int):
    """
    Trains a tagger model and returns its results for the next task.

    Collects the samples, trains and saves the model, stores the report plot
    on the Tagger object (without saving the object) and returns the
    statistics as a dict.

    :param tagger_id: ID of the Tagger object to train.
    :return: Dict with the tagger ID, relative model path, precision, recall,
        F1 score, feature and example counts, confusion matrix, model size (MB),
        plot path and classes.
    :raises Exception: Any error is logged, recorded on the task and re-raised.
    """
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'train_tagger' for tagger with ID: {tagger_id}!")
    tagger_object = Tagger.objects.get(id=tagger_id)
    task_object = tagger_object.task
    try:
        progress = ShowProgress(task_object, multiplier=1)
        progress.update_step('scrolling positives')
        progress.update_view(0)

        # indices & fields to collect the samples from
        indices = get_indices_from_object(tagger_object)
        field_data = json.loads(tagger_object.fields)

        stop_words = load_stop_words(tagger_object.stop_words)
        ignore_numbers = tagger_object.ignore_numbers

        # "default" means that no explicit scoring function is passed on
        scoring_function = (None
                            if tagger_object.scoring_function == "default"
                            else tagger_object.scoring_function)

        logging.getLogger(INFO_LOGGER).info(
            f"Using scoring function: {scoring_function}.")

        # the embedding is optional
        embedding = None
        if tagger_object.embedding:
            embedding = W2VEmbedding()
            embedding.load_django(tagger_object.embedding)

        # DataSample retrieves the positive and negative examples
        samples = DataSample(
            tagger_object,
            indices=indices,
            field_data=field_data,
            show_progress=progress,
            snowball_language=tagger_object.snowball_language,
            detect_lang=tagger_object.detect_lang,
            balance=tagger_object.balance,
            balance_to_max_limit=tagger_object.balance_to_max_limit)

        # switch the status to training
        progress.update_step("training")
        progress.update_view(0)

        text_tagger = TextTagger(embedding=embedding,
                                 custom_stop_words=stop_words,
                                 ignore_numbers=ignore_numbers,
                                 classifier=tagger_object.classifier,
                                 vectorizer=tagger_object.vectorizer,
                                 analyzer=tagger_object.analyzer)
        text_tagger.train(samples.data,
                          pos_label=tagger_object.pos_label,
                          field_list=field_data,
                          scoring=scoring_function)

        # write the model to disk
        tagger_full_path, relative_tagger_path = tagger_object.generate_name(
            "tagger")
        text_tagger.save(tagger_full_path)

        report = text_tagger.report

        # The image has to be saved before its path is built.
        plot_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(plot_name,
                                create_tagger_plot(report.to_dict()),
                                save=False)
        plot_path = pathlib.Path(MEDIA_URL) / plot_name

        # number of examples per label
        num_examples = {}
        for label, examples in samples.data.items():
            num_examples[label] = len(examples)

        # bytes to mb
        model_size = round(float(os.path.getsize(tagger_full_path)) / 1000000,
                           1)

        return {
            "id": tagger_id,
            "tagger_path": relative_tagger_path,
            "precision": float(report.precision),
            "recall": float(report.recall),
            "f1_score": float(report.f1_score),
            "num_features": report.num_features,
            "num_examples": num_examples,
            "confusion_matrix": report.confusion.tolist(),
            "model_size": model_size,
            "plot": str(plot_path),
            "classes": report.classes
        }

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Exemple #28
0
def annotator_task(self, annotator_task_id):
    """
    Split a parent Annotator job into one child Annotator per assigned user.

    For every user attached to the parent annotator and every source index,
    an Elasticsearch index named ``{index}_{user_pk}_{task_id}`` is created,
    attached to the project and to that user's child Annotator, and populated
    through ``bulk_add_documents``. The children are then collected into an
    AnnotatorGroup whose parent is the original annotator, the parent's user
    list is cleared, and the task is marked complete.

    :param self: Presumably the bound task instance; it is not used in the body.
    :param annotator_task_id: Primary key of the parent Annotator object.
    :return: True when the task has finished successfully.
    :raises Exception: Any error raised inside the processing section is
        logged, recorded on the task (status set to failed) and re-raised.
    """
    annotator_obj = Annotator.objects.get(pk=annotator_task_id)
    annotator_group_children = []

    indices = annotator_obj.get_indices()
    users = [user.pk for user in annotator_obj.annotator_users.all()]

    task_object = annotator_obj.task
    task_id = annotator_obj.task_id

    # Fields that are requested from Elasticsearch. The list is extended in
    # place below.
    # NOTE(review): the extended list (not the raw annotator fields) is what
    # reaches get_selected_fields(); this mirrors the previous behaviour where
    # both names pointed at the same list - confirm this is intended.
    all_fields = json.loads(annotator_obj.fields)
    all_fields.append("texta_meta.document_uuid")

    if annotator_obj.annotation_type == 'entity':
        all_fields.append("texta_facts")
        all_fields.append(texta_mlp.settings.META_KEY)  # Include MLP Meta key here so it would be pulled from Elasticsearch.

    project_obj = Project.objects.get(id=annotator_obj.project_id)
    new_field_type = get_selected_fields(indices, all_fields)
    field_type = add_field_type(new_field_type)
    add_facts_mapping = annotator_obj.add_facts_mapping
    scroll_size = 100

    # One new index per (user, source index) pair.
    new_indices = []
    new_annotators = []

    for user_pk in users:
        annotating_user = User.objects.get(pk=user_pk)
        new_annotators.append(annotating_user.pk)
        new_indices.extend(f"{index}_{user_pk}_{task_id}" for index in indices)

    query = annotator_obj.query

    logging.getLogger(INFO_LOGGER).info(f"Starting task annotator with Task ID {annotator_obj.task_id}.")

    try:
        ec = ElasticCore()
        index_fields = [index_field["path"] for index_field in ec.get_fields(indices)]

        # ElasticSearcher seems to be broken when handling scrolls with only the main field in its field_data instead of all of them in dot notation.
        # Hence this ugly hack is needed if I want to include the MLP meta field inside the output.
        for annotator_field in json.loads(annotator_obj.fields):
            # str.split returns the whole string when the separator is absent,
            # so this yields the field name without any "_mlp." suffix.
            stripped_mlp_field = annotator_field.split("_mlp.")[0]
            for index_field in index_fields:
                if texta_mlp.settings.META_KEY in index_field and stripped_mlp_field in index_field:
                    all_fields.append(index_field)

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        __add_meta_to_original_index(indices, index_fields, show_progress, query, scroll_size, ec)

        # Stays None when there are no users, so the mapping calls after the
        # loop can be skipped instead of failing on an unbound name.
        new_annotator_obj = None

        for new_annotator in new_annotators:
            new_annotator_obj = Annotator.objects.create(
                annotator_uid=f"{annotator_obj.description}_{new_annotator}_{annotator_obj.task_id}",
                description=f"{annotator_obj.description}",
                author=annotator_obj.author,
                project=annotator_obj.project,
                total=annotator_obj.total,
                fields=annotator_obj.fields,
                add_facts_mapping=add_facts_mapping,
                annotation_type=annotator_obj.annotation_type,
                binary_configuration=annotator_obj.binary_configuration,
                multilabel_configuration=annotator_obj.multilabel_configuration,
                entity_configuration=annotator_obj.entity_configuration,
            )
            new_annotator_obj.annotator_users.add(new_annotator)

            # Derive this user's index names directly instead of scanning the
            # full new_indices list for matches.
            for index in indices:
                new_index = f"{index}_{new_annotator}_{task_id}"
                logging.getLogger(INFO_LOGGER).info(f"New Index check {new_index} for user {new_annotator}")
                logging.getLogger(INFO_LOGGER).info(f"Index object {indices}")

                # NOTE(review): the searcher covers every source index, not
                # only `index` - kept as in the original; confirm intent.
                elastic_search = ElasticSearcher(indices=indices, field_data=all_fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
                elastic_doc = ElasticDocument(new_index)

                logging.getLogger(INFO_LOGGER).info(f"Updating index schema for index {new_index}")
                # the operations that don't require a mapping update have been completed
                schema_input = update_field_types(indices, all_fields, field_type, flatten_doc=False)
                updated_schema = update_mapping(schema_input, new_index, add_facts_mapping, add_texta_meta_mapping=True)

                logging.getLogger(INFO_LOGGER).info(f"Creating new index {new_index} for user {new_annotator}")
                # create new_index
                ElasticCore().create_index(new_index, updated_schema)

                index_model, _ = Index.objects.get_or_create(name=new_index)
                project_obj.indices.add(index_model)
                # The user segment is the second-to-last "_"-separated part of
                # the index name; only attach the index to its own annotator.
                index_user = index_model.name.rsplit('_', 2)[1]
                if str(index_user) == str(new_annotator):
                    new_annotator_obj.indices.add(index_model)

                logging.getLogger(INFO_LOGGER).info("Indexing documents.")
                # set new_index name as mapping name
                bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=False)

            new_annotator_obj.save()
            annotator_group_children.append(new_annotator_obj.id)
            logging.getLogger(INFO_LOGGER).info(f"Saving new annotator object ID {new_annotator_obj.id}")

        # The mapping helpers are invoked on the last created child, as before.
        # With no users nothing was created and new_indices is empty as well.
        if new_annotator_obj is not None:
            new_annotator_obj.add_annotation_mapping(new_indices)
            new_annotator_obj.add_texta_meta_mapping(new_indices)

        annotator_obj.annotator_users.clear()
        annotator_obj.save()

        annotator_group, _ = AnnotatorGroup.objects.get_or_create(project=annotator_obj.project, parent=annotator_obj)
        annotator_group.children.add(*annotator_group_children)

        # declare the job done
        task_object.complete()

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise

    logging.getLogger(INFO_LOGGER).info(f"Annotator with Task ID {annotator_obj.task_id} successfully completed.")
    return True