Example no. 1
0
def do_topic_model_related_task(
        task_args: TopicModelTrainingTaskArgs,
        processing_opts: TopicModelProcessingOptions) -> None:
    """Run LDA training for the topic model named in ``task_args``.

    Builds preprocessing options and a corpus from the task arguments, trains
    a Mallet-backed LDA model, writes the topic spreadsheets, records the
    resulting metrics on the model's LDA set, and emails the model's owner.
    If anything raises, the error is logged and the LDA set's
    ``error_encountered`` flag is set instead.  The LDA set is always saved.
    """
    topic_mdl = models.TopicModel.get(
        models.TopicModel.id_ == task_args["topic_model_id"])
    assert topic_mdl.lda_set is not None
    try:
        preprocessing_opts = LDAPreprocessingOptions(
            remove_phrases=False,
            # Join phrases only when the caller actually supplied some.
            join_phrases=len(processing_opts['phrases_to_join']) > 0,
            remove_punctuation_and_digits=processing_opts[
                'remove_punctuation'],
            remove_stopwords=processing_opts['remove_stopwords'],
            # Lemmatization is only supported for English content.
            lemmatize_content=(processing_opts['do_lemmatizing']
                               and task_args['language'] == 'english'),
        )  # NOTE: 'do_stemming' in processing_opts is currently ignored.
        corpus = Corpus(file_name=task_args["training_file"],
                        content_column_name=Settings.CONTENT_COL,
                        id_column_name=Settings.ID_COL,
                        language=task_args["language"],
                        extra_stopwords=processing_opts['extra_stopwords'],
                        phrases_to_join=processing_opts['phrases_to_join'],
                        min_word_length=processing_opts['min_word_length'],
                        processing_to_do=preprocessing_opts)
        lda_modeler = LDAModeler(
            corpus,
            iterations=task_args["iterations"],
            mallet_bin_directory=task_args["mallet_bin_directory"],
        )
    except Exception as e:
        # Broad catch is deliberate: this is the top-level task boundary.
        app.logger.critical(f"Error while doing lda training task: {e}")
        topic_mdl.lda_set.error_encountered = True
    else:
        metrics = lda_modeler.model_topics_to_spreadsheet(
            num_topics=topic_mdl.num_topics,
            default_topic_names=topic_mdl.topic_names,
            fname_keywords=task_args["fname_keywords"],
            fname_topics_by_doc=task_args["fname_topics_by_doc"],
        )
        topic_mdl.lda_set.metrics = models.TopicModelMetrics.create(**metrics)
        topic_mdl.lda_set.lda_completed = True
        emailer = emails.Emailer()
        emailer.send_email(
            email_template_name="topic_model_training_finished",
            to_email=topic_mdl.notify_at_email,
            topic_model_name=topic_mdl.name,
            topic_model_id=topic_mdl.id_,
            topic_model_preview_url=
            f"http://{Settings.SERVER_NAME}/playground.html?step=1&id={topic_mdl.id_}",
            metrics=T.cast(T.Dict[str, T.Union[int, float]], metrics),
        )
    finally:
        # Persist either the completed state or the error flag.
        topic_mdl.lda_set.save()
Example no. 2
0
def do_topic_model_related_task(task_args: TopicModelTrainingTaskArgs) -> None:
    """Run LDA training for the topic model named in ``task_args``.

    Builds a corpus from the training file, trains a Mallet-backed LDA model,
    writes the topic spreadsheets, records metrics on the model's LDA set,
    and emails the model's owner.  If construction or training raises, the
    error is logged and ``error_encountered`` is set instead.  The LDA set
    is always saved.
    """
    topic_mdl = models.TopicModel.get(
        models.TopicModel.id_ == task_args["topic_model_id"])
    assert topic_mdl.lda_set is not None
    try:
        corpus = Corpus(
            file_name=task_args["training_file"],
            content_column_name=Settings.CONTENT_COL,
            id_column_name=Settings.ID_COL,
        )
        lda_modeler = LDAModeler(
            corpus,
            iterations=task_args["iterations"],
            mallet_bin_directory=task_args["mallet_bin_directory"],
        )
    # Exception (not BaseException): don't swallow KeyboardInterrupt /
    # SystemExit and record them as task failures.
    except Exception as e:
        logger.critical(f"Error while doing lda training task: {e}")
        topic_mdl.lda_set.error_encountered = True
    else:
        metrics = lda_modeler.model_topics_to_spreadsheet(
            num_topics=topic_mdl.num_topics,
            default_topic_names=topic_mdl.topic_names,
            fname_keywords=task_args["fname_keywords"],
            fname_topics_by_doc=task_args["fname_topics_by_doc"],
        )
        topic_mdl.lda_set.metrics = models.TopicModelMetrics.create(**metrics)
        topic_mdl.lda_set.lda_completed = True
        emailer = emails.Emailer()
        emailer.send_email(
            email_template_name="topic_model_training_finished",
            to_email=topic_mdl.notify_at_email,
            topic_model_name=topic_mdl.name,
            topic_model_preview_url=
            f"http://{Settings.SERVER_NAME}/topicModelPreviews.html?topic_model_id={topic_mdl.id_}",
            metrics=T.cast(T.Dict[str, T.Union[int, float]], metrics),
        )
    finally:
        # Persist either the completed state or the error flag.
        topic_mdl.lda_set.save()
Example no. 3
0
def do_classifier_related_task(
    task_args: T.Union[ClassifierTrainingTaskArgs,
                       ClassifierPredictionTaskArgs],
) -> None:
    """Run a classifier prediction or training task, per ``task_args``.

    For ``"prediction"``: load the trained model, predict labels for the
    test set file, and email the requester a link to the predictions.
    For ``"training"``: build a classifier, run cross-validation and
    training, record the metrics, and email the requester.

    In both branches any failure is logged and flagged on the relevant
    record via ``error_encountered``; records are always saved.
    """
    if task_args["task_type"] == "prediction":
        test_set = models.TestSet.get(
            models.TestSet.id_ == task_args["test_set_id"])
        assert test_set.inference_began
        assert not test_set.inference_completed

        try:
            classifier_model = ClassifierModel(
                labels=task_args["labels"],
                model_path=task_args["model_path"],
                cache_dir=task_args["cache_dir"],
            )
            classifier_model.predict_and_save_predictions(
                test_set_path=task_args["test_file"],
                content_column=Settings.CONTENT_COL,
                predicted_column=Settings.PREDICTED_LABEL_COL,
                output_file_path=task_args["test_output_file"],
            )
        # Exception (not BaseException): don't swallow KeyboardInterrupt /
        # SystemExit and record them as task failures.
        except Exception as e:
            app.logger.critical(f"Error while doing prediction task: {e}")
            test_set.error_encountered = True
        else:
            test_set.inference_completed = True
            emailer = emails.Emailer()
            emailer.send_email(
                email_template_name="classifier_inference_finished",
                to_email=test_set.notify_at_email,
                classifier_name=test_set.classifier.name,
                classifier_id=test_set.classifier.classifier_id,
                predictions_url=url_for(
                    "ClassifiersTestSetsPredictions",
                    classifier_id=test_set.classifier.classifier_id,
                    test_set_id=test_set.id_,
                    file_type=Settings.DEFAULT_FILE_FORMAT.strip("."),
                    _method="GET",
                ),
            )
        finally:
            # Persist either the completed state or the error flag.
            test_set.save()

    elif task_args["task_type"] == "training":
        clsf = models.Classifier.get(
            models.Classifier.classifier_id == task_args["classifier_id"])
        assert clsf.train_set is not None
        # assert clsf.dev_set is not None

        try:
            classifier_model = ClassifierModel(
                labels=task_args["labels"],
                num_train_epochs=task_args["num_train_epochs"],
                model_path=task_args["model_path"],
                train_file=task_args["train_file"],
                dev_file=task_args["dev_file"],
                cache_dir=task_args["cache_dir"],
                output_dir=task_args["output_dir"],
            )
            metrics = classifier_model.perform_cv_and_train()
        # Exception (not BaseException): same rationale as above.
        except Exception as e:
            app.logger.critical(
                f"Error while doing classifier training task: {e}")
            clsf.train_set.error_encountered = True
            # clsf.dev_set.error_encountered = True
        else:
            clsf.train_set.training_or_inference_completed = True
            # clsf.dev_set.training_or_inference_completed = True
            clsf.train_set.metrics = models.ClassifierMetrics(**metrics)
            clsf.train_set.metrics.save()
            emailer = emails.Emailer()
            emailer.send_email(
                email_template_name="classifier_training_finished",
                to_email=clsf.notify_at_email,
                classifier_name=clsf.name,
                classifier_id=clsf.classifier_id,
                metrics=T.cast(T.Dict[str, float], metrics),
            )
        finally:
            # clsf.dev_set.save()
            clsf.train_set.save()
            clsf.save()
Example no. 4
0
 def setUp(self) -> None:
     """Set up the test fixture: run the parent setup, then create the
     Emailer instance used by the tests (stored as ``self._emailer``)."""
     super().setUp()
     self._emailer = emails.Emailer()