Beispiel #1
0
    def reevaluate(self, request, pk=None, project_pk=None):
        """Starts re-evaluation task for the Evaluator model."""
        evaluator = self.get_object()
        es_timeout = evaluator.es_timeout
        scroll_size = evaluator.scroll_size
        query = json.loads(evaluator.query)
        evaluation_type = evaluator.evaluation_type
        indices = get_indices_from_object(evaluator)

        if evaluation_type == choices.ENTITY_EVALUATION:
            evaluate_entity_tags_task.apply_async(args=(evaluator.pk, indices, query, es_timeout, scroll_size), queue=CELERY_LONG_TERM_TASK_QUEUE)
        else:
            evaluate_tags_task.apply_async(args=(evaluator.pk, indices, query, es_timeout, scroll_size), queue=CELERY_LONG_TERM_TASK_QUEUE)

        return Response({"success": "Re-evaluation task created"}, status=status.HTTP_200_OK)
Beispiel #2
0
def train_embedding(embedding_id):
    # retrieve embedding & task objects
    embedding_object = Embedding.objects.get(pk=embedding_id)
    task_object = embedding_object.task
    show_progress = ShowProgress(task_object, multiplier=1)
    show_progress.update_step('training')
    show_progress.update_view(0)
    try:
        # retrieve indices from project
        indices = get_indices_from_object(embedding_object)
        field_data = json.loads(embedding_object.fields)
        max_documents = embedding_object.max_documents
        use_phraser = embedding_object.use_phraser
        snowball_language = embedding_object.snowball_language
        # add stemmer if asked
        if snowball_language:
            snowball_lemmatizer = ElasticAnalyzer(language=snowball_language)
        else:
            snowball_lemmatizer = None
        # iterator for texts
        sentences = ElasticSearcher(query=json.loads(embedding_object.query),
                                    indices=indices,
                                    field_data=field_data,
                                    callback_progress=show_progress,
                                    scroll_limit=max_documents,
                                    text_processor=TextProcessor(
                                        sentences=True,
                                        remove_stop_words=True,
                                        words_as_list=True,
                                        lemmatizer=snowball_lemmatizer),
                                    output=ElasticSearcher.OUT_TEXT)
        # create embedding object & train
        embedding = embedding_object.get_embedding()
        embedding.train(sentences, use_phraser=use_phraser)

        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save model
        show_progress.update_step('saving')
        full_model_path, relative_model_path = embedding_object.generate_name(
            "embedding")
        embedding.save(full_model_path)

        # save gensim model
        if embedding_object.embedding_type == "FastTextEmbedding":
            fast_text_embedding_model = joblib.load(
                full_model_path)["embedding"]
            gensim_full_model_path = full_model_path + "_" + FACEBOOK_MODEL_SUFFIX
            gensim.models.fasttext.save_facebook_model(
                fast_text_embedding_model,
                gensim_full_model_path,
                encoding='utf-8')

        # save model path
        embedding_object.embedding_model.name = relative_model_path
        embedding_object.vocab_size = embedding.model.wv.vectors.shape[0]
        embedding_object.save()
        # declare the job done
        task_object.complete()
        return True
    except Exception as e:
        # declare the job failed
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
Beispiel #3
0
def train_tagger_task(tagger_id: int):
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'train_tagger' for tagger with ID: {tagger_id}!")
    tagger_object = Tagger.objects.get(id=tagger_id)
    task_object = tagger_object.task
    try:
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling positives')
        show_progress.update_view(0)

        # retrieve indices & field data
        indices = get_indices_from_object(tagger_object)
        field_data = json.loads(tagger_object.fields)
        # split stop words by space or newline and remove empties

        stop_words = load_stop_words(tagger_object.stop_words)
        ignore_numbers = tagger_object.ignore_numbers

        # get scoring function
        if tagger_object.scoring_function != "default":
            scoring_function = tagger_object.scoring_function
        else:
            scoring_function = None

        logging.getLogger(INFO_LOGGER).info(
            f"Using scoring function: {scoring_function}.")

        # load embedding if any
        if tagger_object.embedding:
            embedding = W2VEmbedding()
            embedding.load_django(tagger_object.embedding)
        else:
            embedding = None
        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices=indices,
            field_data=field_data,
            show_progress=show_progress,
            snowball_language=tagger_object.snowball_language,
            detect_lang=tagger_object.detect_lang,
            balance=tagger_object.balance,
            balance_to_max_limit=tagger_object.balance_to_max_limit)
        # update status to training
        show_progress.update_step("training")
        show_progress.update_view(0)
        # train model
        tagger = TextTagger(embedding=embedding,
                            custom_stop_words=stop_words,
                            ignore_numbers=ignore_numbers,
                            classifier=tagger_object.classifier,
                            vectorizer=tagger_object.vectorizer,
                            analyzer=tagger_object.analyzer)
        tagger.train(data_sample.data,
                     pos_label=tagger_object.pos_label,
                     field_list=field_data,
                     scoring=scoring_function)

        # save tagger to disk
        tagger_full_path, relative_tagger_path = tagger_object.generate_name(
            "tagger")
        tagger.save(tagger_full_path)

        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        tagger_object.plot.save(image_name,
                                create_tagger_plot(tagger.report.to_dict()),
                                save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name

        # get num examples
        num_examples = {k: len(v) for k, v in data_sample.data.items()}

        return {
            "id":
            tagger_id,
            "tagger_path":
            relative_tagger_path,
            "precision":
            float(tagger.report.precision),
            "recall":
            float(tagger.report.recall),
            "f1_score":
            float(tagger.report.f1_score),
            "num_features":
            tagger.report.num_features,
            "num_examples":
            num_examples,
            "confusion_matrix":
            tagger.report.confusion.tolist(),
            "model_size":
            round(float(os.path.getsize(tagger_full_path)) / 1000000,
                  1),  # bytes to mb
            "plot":
            str(image_path),
            "classes":
            tagger.report.classes
        }

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Beispiel #4
0
def train_bert_tagger(tagger_id, testing=False):
    # retrieve neurotagger & task objects
    tagger_object = BertTaggerObject.objects.get(pk=tagger_id)

    # Handle previous tagger models that exist in case of retrains.
    model_path = pathlib.Path(
        tagger_object.model.path) if tagger_object.model else None

    task_object = tagger_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)

        # set loading model from a checkpoint False by default
        from_checkpoint = False
        checkpoint_model = tagger_object.checkpoint_model

        pos_label = tagger_object.pos_label

        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit)
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # select sklearn average function based on the number of classes
        if data_sample.is_binary:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_BINARY
        else:
            sklearn_avg_function = choices.DEFAULT_SKLEARN_AVG_MULTICLASS

        # if checkpoint model is detected, load it and use it for further training
        if checkpoint_model:
            logging.getLogger(INFO_LOGGER).info(
                f"Loading model from a checkpoint stored in '{tagger_object}'..."
            )

            # use the same pre-trained bert model as the checkpoint model
            tagger_object.bert_model = checkpoint_model.bert_model
            tagger = checkpoint_model.load_tagger()

            # set sklearn avg function in case the number of classes has changed
            tagger.sklearn_avg_function = sklearn_avg_function

            # set loading model from a checkpoint True
            from_checkpoint = True

        # if no checkpoint model is given, train a new model
        else:
            logging.getLogger(INFO_LOGGER).info(
                "No checkpoint model detected, training a new model...")
            # NB! saving pretrained models must be disabled!
            tagger = BertTagger(
                allow_standard_output=choices.DEFAULT_ALLOW_STANDARD_OUTPUT,
                autoadjust_batch_size=choices.DEFAULT_AUTOADJUST_BATCH_SIZE,
                sklearn_avg_function=sklearn_avg_function,
                use_gpu=tagger_object.use_gpu,
                save_pretrained=False,
                pretrained_models_dir=BERT_PRETRAINED_MODEL_DIRECTORY,
                logger=logging.getLogger(INFO_LOGGER),
                cache_dir=BERT_CACHE_DIR)

        # use state dict for binary taggers
        if data_sample.is_binary:
            tagger.config.use_state_dict = True
        else:
            tagger.config.use_state_dict = False
            pos_label = ""

        # train tagger and get result statistics
        report = tagger.train(data_sample.data,
                              from_checkpoint=from_checkpoint,
                              pos_label=pos_label,
                              n_epochs=tagger_object.num_epochs,
                              max_length=tagger_object.max_length,
                              batch_size=tagger_object.batch_size,
                              lr=tagger_object.learning_rate,
                              eps=tagger_object.eps,
                              split_ratio=tagger_object.split_ratio,
                              bert_model=tagger_object.bert_model)
        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # save tagger to disc
        tagger_path = os.path.join(
            BERT_FINETUNED_MODEL_DIRECTORY,
            f'{tagger_object.MODEL_TYPE}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)

        # set tagger location
        tagger_object.model.name = tagger_path

        report_dict = report.to_dict()

        # save tagger plot
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png',
                                create_tagger_plot(report_dict),
                                save=False)
        # save label index
        tagger_object.label_index = json.dumps(
            tagger.config.label_reverse_index)
        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.validation_loss = report.validation_loss
        tagger_object.epoch_reports = json.dumps(
            [a.to_dict() for a in tagger.epoch_reports])
        tagger_object.num_examples = json.dumps(
            {k: len(v)
             for k, v in list(data_sample.data.items())})
        tagger_object.adjusted_batch_size = tagger.config.batch_size
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)
        # save tagger object
        tagger_object.save()
        # declare the job done
        task_object.complete()

        # Cleanup after the transaction to ensure integrity database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
        raise
Beispiel #5
0
def train_torchtagger(tagger_id, testing=False):
    try:
        # retrieve neurotagger & task objects
        tagger_object = TorchTaggerObject.objects.get(pk=tagger_id)

        # Handle previous tagger models that exist in case of retrains.
        model_path = pathlib.Path(tagger_object.model.path) if tagger_object.model else None

        task_object = tagger_object.task
        model_type = TorchTaggerObject.MODEL_TYPE
        show_progress = ShowProgress(task_object, multiplier=1)
        # get fields & indices
        fields = json.loads(tagger_object.fields)
        indices = get_indices_from_object(tagger_object)
        # load embedding
        embedding = W2VEmbedding()
        embedding.load_django(tagger_object.embedding)
        # create Datasample object for retrieving positive and negative sample
        data_sample = DataSample(
            tagger_object,
            indices,
            fields,
            show_progress=show_progress,
            join_fields=True,
            balance=tagger_object.balance,
            use_sentence_shuffle=tagger_object.use_sentence_shuffle,
            balance_to_max_limit=tagger_object.balance_to_max_limit
        )
        show_progress.update_step('training')
        show_progress.update_view(0.0)

        # get num examples and save to model
        num_examples = {k: len(v) for k, v in data_sample.data.items()}
        tagger_object.num_examples = json.dumps(num_examples)

        tagger_object.save()

        # create TorchTagger
        tagger = TorchTagger(
            embedding,
            model_arch=tagger_object.model_architecture
        )
        # train tagger and get result statistics
        report = tagger.train(data_sample.data, num_epochs=int(tagger_object.num_epochs), pos_label=tagger_object.pos_label)
        # close all db connections
        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()
        # save tagger to disk
        tagger_path = os.path.join(RELATIVE_MODELS_PATH, model_type, f'{model_type}_{tagger_id}_{secrets.token_hex(10)}')
        tagger.save(tagger_path)


        # set tagger location
        tagger_object.model.name = tagger_path
        # save tagger plot
        report_dict = report.to_dict()
        tagger_object.plot.save(f'{secrets.token_hex(15)}.png', create_tagger_plot(report_dict), save=False)
        # save label index
        tagger_object.label_index = json.dumps(tagger.label_reverse_index)
        # stats to model object
        tagger_object.f1_score = report.f1_score
        tagger_object.precision = report.precision
        tagger_object.recall = report.recall
        tagger_object.accuracy = report.accuracy
        tagger_object.training_loss = report.training_loss
        tagger_object.epoch_reports = json.dumps([a.to_dict() for a in tagger.epoch_reports])
        tagger_object.confusion_matrix = json.dumps(report.confusion.tolist())
        tagger_object.classes = json.dumps(report.classes, ensure_ascii=False)

        # save tagger object
        tagger_object.save()
        # declare the job done
        task_object.complete()

        # Cleanup after the transaction to ensure integrity database records.
        if model_path and model_path.exists():
            model_path.unlink(missing_ok=True)

        return True


    except Exception as e:
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise
Beispiel #6
0
def train_crf_task(crf_id: int):
    """
    Trains CRF model.
    """
    try:
        # get task object
        logging.getLogger(INFO_LOGGER).info(
            f"Starting task 'train_crf' for CRFExtractor with ID: {crf_id}!")
        crf_object = CRFExtractorObject.objects.get(id=crf_id)
        task_object = crf_object.task
        # create progress object
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling documents')
        show_progress.update_view(0)
        # retrieve indices & field data
        indices = get_indices_from_object(crf_object)
        mlp_field = crf_object.mlp_field

        # load embedding if any
        if crf_object.embedding:
            embedding = crf_object.embedding.get_embedding()
            embedding.load_django(crf_object.embedding)
        else:
            embedding = None

        # scroll docs
        logging.getLogger(INFO_LOGGER).info(
            f"Scrolling data for CRFExtractor with ID: {crf_id}!")
        documents = ElasticSearcher(query=crf_object.get_query(),
                                    indices=indices,
                                    callback_progress=show_progress,
                                    text_processor=None,
                                    field_data=[mlp_field, "texta_facts"],
                                    output=ElasticSearcher.OUT_DOC,
                                    flatten=False)

        # create config
        config = crf_object.get_crf_config()
        # start training
        logging.getLogger(INFO_LOGGER).info(
            f"Training the model for CRFExtractor with ID: {crf_id}!")
        # create extractor
        extractor = CRFExtractor(config=config, embedding=embedding)
        # train the CRF model
        model_full_path, relative_model_path = crf_object.generate_name("crf")
        report, _ = extractor.train(documents,
                                    save_path=model_full_path,
                                    mlp_field=mlp_field)
        # Save the image before its path.
        image_name = f'{secrets.token_hex(15)}.png'
        crf_object.plot.save(image_name,
                             create_tagger_plot(report.to_dict()),
                             save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        # pass results to next task
        return {
            "id":
            crf_id,
            "best_c_values":
            extractor.best_c_values,
            "extractor_path":
            relative_model_path,
            "precision":
            float(report.precision),
            "recall":
            float(report.recall),
            "f1_score":
            float(report.f1_score),
            "confusion_matrix":
            report.confusion.tolist(),
            "model_size":
            round(float(os.path.getsize(model_full_path)) / 1000000,
                  1),  # bytes to mb
            "plot":
            str(image_path),
        }
    except Exception as e:
        task_object.handle_failed_task(e)
        raise e