    @staticmethod
    def __create_articles() -> Articles:
        article1 = Article("1", "Il s'agit d'un titre", "Le résumé numéro un.",
                           ["theme1"], ["theme1", "old_prediction"], [])
        article2 = Article("2", "Ce sont deux titres",
                           "Le résumé numéro deux.",
                           ["theme1", "theme2", "theme3"], ["other_theme"], [])

        return Articles([article1, article2])
    def test_articles_not_modified_by_predictor(self):
        """
        Test if articles fields 'themes' and 'verified_themes' are not modified
        by the predictor.
        :return:
        """

        tokenizer_init_article = Article(
            id="0",
            title="",
            summary="theme1 theme2 theme3",
            themes=["theme1", "theme2", "theme3"],
            verified_themes=["theme1", "theme2", "theme3"],
            predicted_themes=[])

        articleOne = Article(id="1",
                             title="",
                             summary="theme1 theme2",
                             themes=["one", "two"],
                             verified_themes=["one", "two", "three"],
                             predicted_themes=["three"])

        article_tokenizer = ArticleTextTokenizer(
            Articles([tokenizer_init_article]), 3)
        theme_tokenizer = ArticleThemeTokenizer(
            Articles([tokenizer_init_article]))

        predictor = ArticlePredictor(
            classifier_model=MockModel.get_model(),
            supported_themes=["theme1", "theme2", "theme3"],
            preprocessor=MockPreprocessor(),
            article_tokenizer=article_tokenizer,
            theme_tokenizer=theme_tokenizer)

        prediction = predictor.predict_preprocessed(
            Articles(article=articleOne))

        article_with_predictions = prediction.get_articles_with_predictions()[0]

        self.assertEqual(["one", "two"], article_with_predictions.themes)
        self.assertEqual(["one", "two", "three"],
                         article_with_predictions.verified_themes)
        self.assertEqual(["theme1", "theme2"],
                         article_with_predictions.predicted_themes)
Example #3
    def __init__(self, X: List[List[Optional[Any]]],
                 Y: List[List[Optional[Any]]], articles: Articles,
                 validation_ratio: float, batch_size: int):
        """
        Creates and wraps a TensorFlow dataset.
        :param X: input matrix
        :param Y: output matrix
        :param validation_ratio: fraction of the rows reserved for validation
        :param batch_size: number of rows per batch
        """
        if len(X) == 0:
            raise Exception("X matrix has no rows!")

        self.row_count: int = len(X)
        self.article_length: int = len(X[0])
        self.theme_count: int = len(Y[0])

        self.train_ratio: float = 1 - validation_ratio

        self.train_size = math.ceil(self.train_ratio * self.row_count)
        # Derive the validation size from the train size so the two always sum
        # to row_count (a second ceil could over-count by one row).
        self.validation_size = self.row_count - self.train_size

        self.train_batch_count = int(math.ceil(self.train_size / batch_size))
        self.validation_batch_count = int(
            math.ceil(self.validation_size / batch_size))

        self.X_train = X[:self.train_size]
        self.Y_train = Y[:self.train_size]
        self.X_val = X[self.train_size:]
        self.Y_val = Y[self.train_size:]
        self.articles_train: Articles = Articles(articles[:self.train_size])
        self.articles_validation: Articles = Articles(
            articles[self.train_size:])

        # tf.data.Dataset creation

        # Only the training data is shuffled; evaluation does not need it.
        self.trainData = tf.data.Dataset.from_tensor_slices((self.X_train, self.Y_train))\
            .shuffle(len(self.X_train))\
            .batch(batch_size)\
            .repeat()

        self.validationData = tf.data.Dataset.from_tensor_slices((self.X_val, self.Y_val))\
            .batch(batch_size)\
            .repeat()
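# Usage sketch (hedged): "ArticlesDataset" is an assumed name for the class
# this __init__ belongs to, and "model" is a compiled Keras model. Because
# both pipelines call .repeat(), Keras must be told how many batches make up
# one epoch, which is what the precomputed batch counts are for.
dataset = ArticlesDataset(X, Y, articles, validation_ratio=0.2, batch_size=32)
model.fit(dataset.trainData,
          steps_per_epoch=dataset.train_batch_count,
          validation_data=dataset.validationData,
          validation_steps=dataset.validation_batch_count,
          epochs=10)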
Example #4
    def __init__(self, articles: Articles):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.themes())

        self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())

        # Remove the first column: index 0 is reserved by the tokenizer, so
        # that column contains only 0s.
        self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

        # Create the list of themes, ordered as in the tokenizer.
        # word_index starts at 1; index 0 is reserved.
        self.orderedThemes: List[str] = []
        for i in range(1, len(self.tokenizer.word_index) + 1):
            self.orderedThemes.append(self.tokenizer.index_word[i])

        self.themes_count = len(self.tokenizer.word_index)
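# Standalone sketch of the Keras behaviour that motivates deleting the first
# column above: texts_to_matrix reserves index 0, so column 0 of the matrix
# it returns is always all zeros.
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

t = Tokenizer()
t.fit_on_texts(["theme1 theme2", "theme1"])
m = t.texts_to_matrix(["theme1 theme2", "theme1"])  # shape (2, len(word_index) + 1)
assert np.all(m[:, 0] == 0)  # column 0 is never used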
    def __init__(self, articles: Articles, max_article_length: int):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.title_and_summary())
        self.max_article_length: int = max_article_length

        self.sequences = self.transform_to_sequences(articles)
        # +1 because sequences are padded with 0, which is a reserved index.
        self.voc_size = len(self.tokenizer.word_index) + 1
        self.document_count = self.tokenizer.document_count
    def transform_to_sequences(
            self, preprocessed_articles: Articles) -> List[List[Optional[Any]]]:
        """Transform article content into padded vectors of length "max_article_length"."""
        matrix = self.tokenizer.texts_to_sequences(
            preprocessed_articles.title_and_summary())
        matrix = keras.preprocessing.sequence.pad_sequences(
            matrix, value=0, padding='post', maxlen=self.max_article_length)
        return matrix
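# Quick standalone illustration of the post-padding used above (plain Keras,
# no project code): short sequences get 0s appended, and sequences longer
# than maxlen are truncated from the front by default.
from tensorflow import keras

padded = keras.preprocessing.sequence.pad_sequences(
    [[5, 3], [7, 2, 9, 1]], value=0, padding='post', maxlen=3)
# padded == [[5, 3, 0], [2, 9, 1]]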
    def process_articles(self, articles: Articles) -> Articles:
        """
        Remove stopwords and lemmatize each article, for the specified language.
        :param articles: articles to process
        :return: processed articles
        """
        # Use a context manager so the worker pool is shut down cleanly.
        with Pool(8) as pool:
            return Articles(pool.map(self.process_article, articles.items))
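# Usage caveat (hedged sketch; "Preprocessor" stands in for whatever class
# defines process_articles here): multiprocessing.Pool pickles the mapped
# callable, so on platforms that spawn worker processes the call site should
# live under a __main__ guard.
if __name__ == "__main__":
    preprocessor = Preprocessor()
    processed = preprocessor.process_articles(
        Articles.from_file("articles_fr.json"))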
Example #8
class ArticlesPrediction:

    raw_predictions: Dict[str, List[float]]
    __theme_tokenizer: ArticleThemeTokenizer
    articles: Articles

    def __init__(self, theme_tokenizer: ArticleThemeTokenizer, articles: Articles):
        self.__theme_tokenizer = theme_tokenizer
        self.articles = articles
        # Initialized per instance: a mutable class-level default ({}) would
        # be shared across every ArticlesPrediction instance.
        self.raw_predictions = {}

    def addPredictionsForArticle(self, predictions: List[float], article_id: str):
        """
        Add the raw predictions for the given article.
        :param predictions: prediction probabilities, one per theme
        :param article_id: id of the article the predictions belong to
        """
        self.raw_predictions[article_id] = predictions

    def get_articles_with_predictions(self, threshold: float = 0.5) -> Articles:
        return self.__apply_on_articles(threshold)


    def __apply_on_article(self, article: Article, threshold: float):
        """
        Apply the predictions on an article.
        :param article:
        :param threshold: Min probability to consider a theme as positively predicted.
        """
        if article.id not in self.raw_predictions:
            raise Exception(f"No prediction found for article '{article.id}'.")

        article.predicted_themes = self.__transform_to_themes(
            self.raw_predictions[article.id], threshold)

    def __apply_on_articles(self, threshold: float) -> Articles:
        """
        Apply the predictions on articles.
        :param threshold: Min probability to consider a theme as positively predicted.
        """
        articles = self.articles.deep_copy()

        for article in articles:
            self.__apply_on_article(article, threshold)

        return articles


    def __transform_to_themes(self, predictions: List[float], threshold: float) -> List[str]:
        """
        Transform predictions expressed as probabilities into a list of theme names.
        :param predictions: Predictions for a single article
        :param threshold: Min probability to consider a theme as positively predicted.
        :return:
        """
        boolean_vector = list(map(lambda probability: probability >= threshold, predictions))
        return self.__theme_tokenizer.boolean_vector_to_themes(boolean_vector)
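# Usage sketch (hedged): assumes "theme_tokenizer" was fitted on three themes
# and "articles" contains an article with id "1", as in the tests below.
prediction = ArticlesPrediction(theme_tokenizer, articles)
prediction.addPredictionsForArticle([0.1, 0.7, 0.4], article_id="1")

# The default threshold (0.5) keeps only the second theme...
strict = prediction.get_articles_with_predictions()
# ...while a lower threshold also keeps the first and third.
permissive = prediction.get_articles_with_predictions(0.1)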
Example #9
    def process_article(self, article: Article) -> Article:
        """
        Remove stopwords and lemmatize one single article, for the specified language.
        :param article: article to process
        :return: processed article
        """
        while True:
            try:
                return self.__execute_swift_program(
                    Articles(article=article)).items[0]
            except Exception:
                # Retry until the external preprocessing tool succeeds.
                continue
    def test_save(self):
        article1 = Article("1", "title", "summary",
                           ["theme1", "theme2", "theme3"], [], [])
        article2 = Article("2", "title", "summary", ["theme1", "theme4"], [],
                           [])
        articles = Articles([article1, article2])
        tokenizer = ArticleThemeTokenizer(articles)

        tokenizer.save("test.json")
        with open("test.json", "r") as file:
            content = file.readlines()
            print("f")
Example #11
    def __execute_swift_program(self, articles: Articles) -> Articles:
        (input_file, input_path) = tempfile.mkstemp()
        (output_file, output_path) = tempfile.mkstemp()

        os.close(input_file)
        os.close(output_file)

        articles.save(input_path)
        self.logger.info(f"Articles about to be processed available at {input_path}.")

        command_directory = os.path.dirname(os.path.abspath(__file__))
        command_path = f"{command_directory}/ArticlePreprocessorTool"

        with subprocess.Popen([command_path, input_path, output_path],
                              stdout=subprocess.PIPE) as process:
            while True:
                output = process.stdout.readline()
                if process.poll() is not None:
                    break
                if output:
                    print(output.strip(), end="\r")

        print("", end="\r")
        self.logger.info("Finished processing %d articles.", articles.count())
        self.logger.info(f"Preprocessed articles available at {output_path}.")

        try:
            processed_articles = Articles.from_file(output_path)
            self.failed_attemps = 0
            return processed_articles
        except JSONDecodeError:
            self.logger.error(
                f"Failed to read the processed articles... trying again "
                f"(attempt {self.failed_attemps}).")
            self.failed_attemps += 1
            if self.failed_attemps > 5:
                raise
            return self.__execute_swift_program(articles)
Example #12
    def predict(self, articles_original: Articles) -> ArticlesPrediction:
        """
        Pre-process the articles, compute the predictions for each of them and
        aggregate the predictions into an ArticlesPrediction object, which is
        returned.
        :param articles_original: NON-preprocessed articles
        """
        predictions = ArticlesPrediction(self.theme_tokenizer, articles_original)
        processed_articles = self.preprocessor.process_articles(articles_original)

        self.logger.debug("Will start predictions with keras model")
        matrix = self.article_tokenizer.transform_to_sequences(processed_articles)
        prediction_matrix = self.classifier_model.predict(matrix)
        self.logger.debug("Did predictions with keras model")

        for idx, prediction_vector in enumerate(prediction_matrix):
            article_id = processed_articles[idx].id
            predictions.addPredictionsForArticle(prediction_vector, article_id)

        self.logger.info("Finished predicting themes for %d articles", articles_original.count())
        return predictions
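# Usage sketch (hedged): "predictor" is an ArticlePredictor wired up as in
# the test near the top of this page; raw articles go in, and articles with
# predicted_themes filled in come out.
prediction = predictor.predict(all_articles)
articles_with_themes = prediction.get_articles_with_predictions(threshold=0.5)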
    def test_subtraction(self):
        articles = self.create_articles()

        articles_to_remove = Articles(self.create_articles()[0:2])

        filtered_articles = articles - articles_to_remove

        self.assertEqual(filtered_articles.count() + 2, articles.count())
        self.assertFalse(filtered_articles.contains(articles_to_remove[0].id))
        self.assertFalse(filtered_articles.contains(articles_to_remove[1].id))
        self.assertTrue(filtered_articles.contains(articles[2].id))
        self.assertTrue(filtered_articles.contains(articles[3].id))
        self.assertTrue(filtered_articles.contains(articles[4].id))
        self.assertTrue(filtered_articles.contains(articles[5].id))
    def test_boolean_vector_to_themes(self):
        article1 = Article("1", "title", "summary",
                           ["theme1", "theme2", "theme3"], [], [])
        article2 = Article("2", "title", "summary", ["theme1", "theme4"], [],
                           [])
        articles = Articles([article1, article2])

        tokenizer = ArticleThemeTokenizer(articles)

        self.assertEqual(4, tokenizer.themes_count)
        self.assertEqual(["theme1", "theme2", "theme3", "theme4"],
                         tokenizer.orderedThemes)
        self.assertEqual(["theme1", "theme4"],
                         tokenizer.boolean_vector_to_themes(
                             [True, False, False, True]))
        self.assertEqual([],
                         tokenizer.boolean_vector_to_themes(
                             [False, False, False, False]))
        self.assertEqual(["theme3"],
                         tokenizer.boolean_vector_to_themes(
                             [False, False, True, False]))
    @staticmethod
    def create_articles() -> Articles:
        article1 = Article(title="Title",
                           summary="summary",
                           themes=[],
                           verified_themes=[],
                           predicted_themes=[],
                           id="1")
        article2 = Article(title="Title",
                           summary="summary",
                           themes=["T"],
                           verified_themes=["T"],
                           predicted_themes=[],
                           id="2")
        article3 = Article(title="Title",
                           summary="summary",
                           themes=["T", "T2"],
                           verified_themes=[],
                           predicted_themes=[],
                           id="3")
        article4 = Article(title="Title",
                           summary="summary",
                           themes=[],
                           verified_themes=["T"],
                           predicted_themes=[],
                           id="4")
        article5 = Article(title="Title",
                           summary="summary",
                           themes=["T2"],
                           verified_themes=["T"],
                           predicted_themes=[],
                           id="5")
        article6 = Article(title="Title",
                           summary="summary",
                           themes=["T", "T2", "T3"],
                           verified_themes=["T", "T2", "T3"],
                           predicted_themes=["T3"],
                           id="6")

        return Articles(
            [article1, article2, article3, article4, article5, article6])
Example #16
    def predict_preprocessed(self, processed_articles: Articles) -> ArticlesPrediction:
        """
        Compute the predictions for each article and aggregate them into an
        ArticlesPrediction object, which is returned.
        The articles must have been pre-processed beforehand!
        :param processed_articles: preprocessed articles
        """
        predictions = ArticlesPrediction(self.theme_tokenizer, processed_articles)

        self.logger.debug("Will start predictions with keras model")
        matrix = self.article_tokenizer.transform_to_sequences(processed_articles)
        prediction_matrix = self.classifier_model.predict(matrix)
        self.logger.debug("Did predictions with keras model")

        for idx, prediction_vector in enumerate(prediction_matrix):
            article_id = processed_articles[idx].id
            predictions.addPredictionsForArticle(prediction_vector, article_id)

        self.logger.info("Finished predicting themes for %d articles", processed_articles.count())
        return predictions
from typing import List

from tensorflow.keras.models import load_model

# Articles, WeightedBinaryCrossEntropy and F1AUCModelEvaluator are
# project-local classes; their import paths are not shown in this excerpt.

ARTICLE_JSON_FILE = "articles_{}.json"
LANG = "fr"
LANG_FULL = "french"
MODEL_PATH = "model.h5" # Relative path

LIMIT_ARTICLE_COUNT = None # None or a number.

SUPPORTED_THEMES: List[str] = ["computer", "smartphone"]


# Load the articles
# ==================
articles_filepath = ARTICLE_JSON_FILE.format(LANG)

if LIMIT_ARTICLE_COUNT is None:
    all_articles: Articles = Articles.from_file(articles_filepath)
else:
    all_articles: Articles = Articles.from_file(articles_filepath, LIMIT_ARTICLE_COUNT)


# Load the model
# ==================

model = load_model(
    MODEL_PATH,
    custom_objects={"WeightedBinaryCrossEntropy": WeightedBinaryCrossEntropy()})

# Perform evaluation
# ==================

F1AUCModelEvaluator().evaluate(all_articles, SUPPORTED_THEMES)
Example #18
    def testApplyOnArticlesDefaultThreshold(self):
        article1 = Article("1", "title", "summary", ["theme1"],
                           ["theme1", "old_prediction"], ["theme1"])
        article2 = Article("2", "title", "summary", ["theme1", "theme2"],
                           ["other__old_predicted_theme"],
                           ["theme1", "theme2", "theme3"])

        # Article 3 is not used by the assertions, but the tokenizer needs it
        # to know about theme3.
        article3 = Article("3", "title", "summary", ["theme3"], [], [])

        articles = Articles([article1, article2])

        theme_tokenizer = ArticleThemeTokenizer(
            Articles([article1, article2, article3]))

        predictions = ArticlesPrediction(theme_tokenizer, articles)
        predictions.addPredictionsForArticle([0.1, 0.7, 0], article1.id)
        predictions.addPredictionsForArticle([0.4, 0.89, 0.99], article2.id)

        # Apply prediction with standard threshold
        predicted_articles = predictions.get_articles_with_predictions()
        predicted_articles_one = predicted_articles[0]
        predicted_articles_two = predicted_articles[1]

        self.assertEqual(1, len(predicted_articles_one.predicted_themes))
        self.assertFalse("theme1" in predicted_articles_one.predicted_themes)
        self.assertTrue("theme2" in predicted_articles_one.predicted_themes)
        self.assertFalse("theme3" in predicted_articles_one.predicted_themes)

        self.assertEqual(2, len(predicted_articles_two.predicted_themes))
        self.assertFalse("theme1" in predicted_articles_two.predicted_themes)
        self.assertTrue("theme2" in predicted_articles_two.predicted_themes)
        self.assertTrue("theme3" in predicted_articles_two.predicted_themes)

        # Check that the 'verified themes' and 'themes' are not touched!
        self.assertEqual(1, len(predicted_articles_one.themes))
        self.assertTrue("theme1" in predicted_articles_one.themes)
        self.assertFalse("theme2" in predicted_articles_one.themes)
        self.assertFalse("theme3" in predicted_articles_one.themes)

        self.assertEqual(2, len(predicted_articles_two.themes))
        self.assertTrue("theme1" in predicted_articles_two.themes)
        self.assertTrue("theme2" in predicted_articles_two.themes)
        self.assertFalse("theme3" in predicted_articles_two.themes)

        self.assertEqual(1, len(predicted_articles_one.verified_themes))
        self.assertTrue("theme1" in predicted_articles_one.verified_themes)
        self.assertFalse("theme2" in predicted_articles_one.verified_themes)
        self.assertFalse("theme3" in predicted_articles_one.verified_themes)

        self.assertEqual(3, len(predicted_articles_two.verified_themes))
        self.assertTrue("theme1" in predicted_articles_two.verified_themes)
        self.assertTrue("theme2" in predicted_articles_two.verified_themes)
        self.assertTrue("theme3" in predicted_articles_two.verified_themes)

        # Apply prediction with custom threshold
        predicted_articles = predictions.get_articles_with_predictions(0.09)

        predicted_articles_one = predicted_articles[0]
        predicted_articles_two = predicted_articles[1]

        self.assertEqual(2, len(predicted_articles_one.predicted_themes))
        self.assertTrue("theme1" in predicted_articles_one.predicted_themes)
        self.assertTrue("theme2" in predicted_articles_one.predicted_themes)
        self.assertFalse("theme3" in predicted_articles_one.predicted_themes)

        self.assertEqual(3, len(predicted_articles_two.predicted_themes))
        self.assertTrue("theme1" in predicted_articles_two.predicted_themes)
        self.assertTrue("theme2" in predicted_articles_two.predicted_themes)
        self.assertTrue("theme3" in predicted_articles_two.predicted_themes)
    def process_articles(self, articles: Articles) -> Articles:
        return articles.deep_copy()
debugLogger.info(
    "\n\n\n####################################\n####################################"
)

############################################
# Data loading
############################################

# Loading the file
# ============================
debugLogger.info("Loading the file")

articles_filepath = ARTICLE_JSON_FILE.format(LANG)

if LIMIT_ARTICLES_TRAINING:
    all_articles: Articles = Articles.from_file(articles_filepath, 600)
else:
    all_articles: Articles = Articles.from_file(articles_filepath)

all_articles.shuffle()

for article in all_articles:
    article.make_immutable()

# Data filtering and partitioning
# ============================

articles_train: Articles = all_articles.articles_with_all_verified_themes(
    [SUPPORTED_THEME]).deep_copy()

# Remove all unsupported themes, keeping only articles that have at least one
# supported theme.
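# Hedged sketch of the filtering step described above; the real project may
# use dedicated helper methods on Articles instead, which are not shown here.
for article in articles_train:
    article.themes = [t for t in article.themes if t == SUPPORTED_THEME]
    article.verified_themes = [t for t in article.verified_themes
                               if t == SUPPORTED_THEME]
articles_train = Articles([a for a in articles_train if len(a.themes) > 0])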