Code example #1
File: models.py Project: CheckFake/api
    @classmethod
    def from_url(cls, url: str) -> 'WebPage':
        existing = cls.objects.filter(url=url).first()

        # A row whose content_score is still None is mid-computation; tell
        # the caller to retry later ("this article is being processed,
        # please try again in a few minutes")
        if existing and existing.content_score is None:
            raise APIException.info(
                'Cet article est en cours de traitement. Merci de réessayer dans quelques minutes.'
            )

        # Reuse cached scores when they were computed with the current
        # algorithm version less than a week ago
        if (existing
                and existing.scores_version == WebPage.CURRENT_SCORES_VERSION
                and existing.updated_at >
                timezone.now() - datetime.timedelta(days=7)):
            logger.info(f"Returning existing object for url {url}")
            return existing

        elif not existing:
            # First time this URL is seen: create the page and attach it to
            # its (possibly newly created) base domain
            base_domain = extract_base_domain(url)
            logger.debug(f"Base domain found {base_domain}")
            domain, created = BaseDomain.objects.get_or_create(
                base_domain=base_domain)
            existing = cls.objects.create(
                url=url,
                scores_version=WebPage.CURRENT_SCORES_VERSION,
                base_domain=domain,
                total_articles=0)

        try:
            return existing.compute_scores()
        except APIException:
            # Business-level errors are forwarded to the caller untouched
            raise
        except Exception as e:
            # Any other failure: drop the placeholder row so a later request
            # can retry, and surface a generic error ("error while computing
            # the score")
            existing.delete()
            raise APIException.error("Erreur lors du calcul du score.",
                                     internal_message=str(e))
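
A minimal sketch of how a caller might use this factory, assuming a Django REST Framework view (the view class, route and response shape here are illustrative, not part of CheckFake/api; url, content_score and total_articles are real model fields):

from rest_framework.response import Response
from rest_framework.views import APIView

class WebPageScoreView(APIView):  # hypothetical endpoint
    def get(self, request):
        url = request.query_params.get("url", "")
        # from_url either returns a scored WebPage or raises APIException
        page = WebPage.from_url(url)
        return Response({
            "url": page.url,
            "content_score": page.content_score,
            "total_articles": page.total_articles,
        })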
Code example #2
File: models.py Project: CheckFake/api
    def _compute_content_score(self, counter_nouns_article: Counter,
                               related_articles: dict, counter_article: int,
                               article: goose3.article.Article) -> None:
        nb_articles = 0  # related articles actually compared
        interesting_articles = 0  # of those, how many share enough nouns
        scores_new_articles = []  # per-article noun-overlap scores
        dict_interesting_articles = {}  # url -> (title, score)
        parsed_uri = urlparse(self.url)
        logger.debug("URL parsed")
        g = Goose({
            'browser_user_agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"
        })
        blocked_counter = 0
        too_similar_counter = 0

        # Walk through the related articles' URLs
        for link in related_articles['value']:
            linked_url = link['url']
            logger.debug("Found URL: %s", linked_url)

            # Skip links that point back at the article's own site
            if parsed_uri.netloc not in linked_url:
                try:
                    linked_article = g.extract(url=linked_url)
                    logger.debug("Name of the article: %s",
                                 linked_article.title)

                    if "You have been blocked" in linked_article.title:
                        logger.debug(
                            "Article 'You have been blocked' not considered")
                        blocked_counter += 1
                    elif SequenceMatcher(
                            None, article.cleaned_text,
                            linked_article.cleaned_text).ratio() > 0.3:
                        logger.debug(
                            "Article with content too similar not considered")
                        too_similar_counter += 1
                    else:
                        # Count the candidate's nouns, then keep the ones
                        # that also recur (count > 1) in the original
                        new_nouns_article = self.nouns(
                            linked_article.cleaned_text)
                        new_counter_nouns_articles = Counter(
                            self.tokens(new_nouns_article))
                        shared_items = [
                            k for k in counter_nouns_article
                            if k in new_counter_nouns_articles
                            and counter_nouns_article[k] > 1
                        ]
                        # Fraction of the original's recurring nouns that
                        # this candidate shares
                        score_article = len(shared_items) / counter_article
                        if score_article > 0.4:
                            scores_new_articles.append(score_article)
                            interesting_articles += 1
                            dict_interesting_articles[linked_url] = (
                                linked_article.title, score_article)
                        else:
                            logger.debug("Too low score : %s", score_article)
                        nb_articles += 1
                        logger.debug("Percentage for new articles : %s",
                                     scores_new_articles)
                except (ValueError, LookupError, RequestException) as e:
                    logger.error(
                        f"Found page that can't be processed : {linked_url} with error message {e}"
                    )

        # Compute the article's score
        if nb_articles == 0:
            # Nothing could be compared: drop the row and tell the user why
            # (all candidates too similar, or mostly blocked pages)
            self.delete()
            message = (
                "Nous n'avons trouvé que des articles trop similaires au vôtre. "
                "Il se peut qu'ils proviennent tous de la même source.")
            if blocked_counter > too_similar_counter:
                message = "Nous avons trouvé en majorité des articles dont nous n'avons pas pu extraire le contenu."
            raise APIException.info(message)
        elif interesting_articles == 0:
            content_score = 0
        else:
            # Average of two percentages, each truncated to one decimal
            # place: the share of compared articles that passed the 0.4
            # threshold, and the mean overlap score boosted by 1.5 and
            # capped at 100
            coverage = int(interesting_articles / nb_articles * 1000) / 10
            similarity = min(
                100.0,
                int((mean(scores_new_articles) * 1.5) * 1000) / 10)
            content_score = (coverage + similarity) / 2

        logger.debug("Article score : {}".format(content_score))
        self.content_score = content_score
        self.total_articles = nb_articles
        self._store_interesting_related_articles(dict_interesting_articles)
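
To check the final formula in isolation, here is the same arithmetic as a pure function (a standalone sketch; final_score is a made-up name, and the int(... * 1000) / 10 pattern truncates a percentage to one decimal place):

from statistics import mean

def final_score(interesting_articles, nb_articles, scores_new_articles):
    # Share of compared articles that passed the 0.4 overlap threshold,
    # as a percentage truncated to one decimal
    coverage = int(interesting_articles / nb_articles * 1000) / 10
    # Mean overlap score, boosted by 1.5 and capped at 100
    similarity = min(100.0, int((mean(scores_new_articles) * 1.5) * 1000) / 10)
    return (coverage + similarity) / 2

# 2 of 4 articles kept, with overlap scores 0.5 and 0.75:
print(final_score(2, 4, [0.5, 0.75]))  # (50.0 + 93.7) / 2 = 71.85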
Code example #3
File: models.py Project: CheckFake/api
    def compute_scores(self) -> 'WebPage':
        logger.debug("Start compute_scores")
        # Extract the title and the text of the article
        g = Goose({
            'browser_user_agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"
        })
        try:
            article = g.extract(url=self.url)
        except InvalidSchema:
            # The URL cannot be fetched at all ("Adresse invalide" =
            # invalid address)
            self.delete()
            raise APIException.warning("Adresse invalide")
        except RequestException:
            # The site did not respond ("Le site n'est pas joignable" =
            # the site is unreachable)
            self.delete()
            raise APIException.warning("Le site n'est pas joignable")

        # article_counter = Counter(self.tokens(article.cleaned_text))

        logger.debug("Text of the article : %s", article.cleaned_text)
        if article.cleaned_text == "":
            # Goose could not extract any text, so there is nothing to score
            self.delete()
            raise APIException.warning(
                "Oups, nous n'avons pas pu extraire le texte de l'article.")

        nouns_article = self.nouns(article.cleaned_text)
        counter_nouns_article = Counter(self.tokens(nouns_article))
        logger.debug("Nouns in the article : %s", counter_nouns_article)

        # Look for related articles, first over a 7-day period (widened to
        # 30 days below if nothing independent turns up)
        related_articles = get_related_articles(article, 7)

        # Matches that all come from the same publisher do not count as
        # independent coverage
        only_same_publisher = self.check_same_publisher(related_articles)

        if not related_articles.get("value") or only_same_publisher is True:
            logger.debug(
                "No article found, try with a period of 30 days before publishing."
            )
            related_articles = get_related_articles(article, 30)

            only_same_publisher = self.check_same_publisher(related_articles)
            if not related_articles.get(
                    "value") or only_same_publisher is True:
                # Still nothing from an independent source: remember the URL
                # as isolated and give up on scoring it
                isolated, created = IsolatedArticle.objects.get_or_create(
                    url=self.url, base_domain=self.base_domain)
                self.delete()
                raise APIException.info(
                    "Cet article semble isolé, nous n'avons trouvé aucun article en lien avec lui. "
                    "Faites attention!")

        logger.debug("Articles found %s", related_articles)

        # Count the nouns that recur (appear more than once) in the article;
        # only these drive the overlap comparison
        counter_article = 0
        for word in counter_nouns_article:
            if counter_nouns_article[word] > 1:
                counter_article += 1
        logger.debug("Number of interesting nouns : %s", counter_article)

        if counter_article > 2:
            self._compute_content_score(counter_nouns_article,
                                        related_articles, counter_article,
                                        article)
        else:
            # Fewer than three recurring nouns: too little signal to compare
            self.delete()
            raise APIException.warning(
                "Notre méthode de calcul n'a pas pu fournir de résultat sur cet article."
            )

        self.scores_version = WebPage.CURRENT_SCORES_VERSION
        self.save()
        logger.info(f"Finished computing scores for article {self.url}")
        return self
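
The "interesting nouns" gate above only scores pages with more than two recurring nouns; a tiny self-contained illustration of that count (the sample tokens are made up):

from collections import Counter

counter_nouns_article = Counter(
    ["budget", "minister", "minister", "election", "election", "election"])
counter_article = sum(
    1 for word in counter_nouns_article if counter_nouns_article[word] > 1)
print(counter_article)  # 2 -> not > 2, so this page would be rejected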