def from_url(cls, url: str) -> 'WebPage':
    """Return the WebPage for ``url``, reusing a fresh cached score or
    (re)computing scores for a new or stale record.

    Args:
        url: the article URL to analyse.

    Returns:
        The WebPage instance with up-to-date scores.

    Raises:
        APIException: info if the article is still being processed by a
            concurrent request; error if score computation fails
            unexpectedly.
    """
    existing = cls.objects.filter(url=url).first()
    # A record whose content_score is still None is being processed by
    # another request right now.
    if existing and existing.content_score is None:
        raise APIException.info(
            'Cet article est en cours de traitement. Merci de réessayer dans quelques minutes.'
        )
    # Reuse the cached result only if it was computed with the current
    # scoring algorithm AND is less than a week old.
    if (existing and existing.scores_version == WebPage.CURRENT_SCORES_VERSION
            and existing.updated_at > timezone.now() - datetime.timedelta(days=7)):
        logger.info(f"Returning existing object for url {url}")
        return existing

    created_now = existing is None
    if created_now:
        base_domain = extract_base_domain(url)
        logger.debug(f"Base domain found {base_domain}")
        domain, _ = BaseDomain.objects.get_or_create(base_domain=base_domain)
        # content_score stays None until compute_scores() finishes, which is
        # what signals "processing in progress" to concurrent requests.
        existing = cls.objects.create(
            url=url,
            scores_version=WebPage.CURRENT_SCORES_VERSION,
            base_domain=domain,
            total_articles=0)
    try:
        return existing.compute_scores()
    except APIException:
        # Business-level failures propagate unchanged; compute_scores has
        # already cleaned up the record where appropriate.
        raise
    except Exception as e:
        # BUGFIX: only delete the record we created in this call. Previously
        # a transient failure while refreshing a stale-but-valid record
        # destroyed that existing record as well.
        if created_now:
            existing.delete()
        raise APIException.error("Erreur lors du calcul du score.",
                                 internal_message=str(e))
def _compute_content_score(self, counter_nouns_article: Counter,
                           related_articles: dict, counter_article: int,
                           article: goose3.article.Article) -> None:
    """Score the article by comparing its nouns with related articles.

    Each related URL hosted on a different domain is fetched with Goose.
    Pages titled as blocked, or whose text is more than 30% similar to the
    original, are skipped. The remaining articles are scored by the ratio of
    nouns (occurring more than once in the original) that they share with
    the original; a ratio above 0.4 marks the article as "interesting".

    Sets ``self.content_score`` (0-100) and ``self.total_articles``, and
    stores the interesting related articles.

    Args:
        counter_nouns_article: noun-frequency Counter of the original article.
        related_articles: search result; entries in ``['value']`` each carry a 'url'.
        counter_article: number of distinct nouns occurring more than once
            in the original article (the score denominator).
        article: the original extracted article (its cleaned_text is used
            for the similarity check).

    Raises:
        APIException: info when no comparable article could be processed;
            the WebPage record is deleted first.
    """
    nb_articles = 0
    interesting_articles = 0
    scores_new_articles = []
    dict_interesting_articles = {}
    parsed_uri = urlparse(self.url)
    logger.debug("URL parsed")
    g = Goose({
        'browser_user_agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"
    })
    blocked_counter = 0
    too_similar_counter = 0
    # Look for similar articles' url
    for link in related_articles['value']:
        linked_url = link['url']
        logger.debug("Found URL: %s", linked_url)
        # Only compare against articles hosted on a different domain.
        if parsed_uri.netloc not in linked_url:
            try:
                linked_article = g.extract(url=linked_url)
                logger.debug("Name of the article: %s", linked_article.title)
                if "You have been blocked" in linked_article.title:
                    # Anti-bot interstitial page, not a real article.
                    logger.debug(
                        "Article 'You have been blocked' not considered")
                    blocked_counter += 1
                elif SequenceMatcher(
                        None, article.cleaned_text,
                        linked_article.cleaned_text).ratio() > 0.3:
                    # Likely a copy/syndication of the same article.
                    logger.debug(
                        "Article with content too similar not considered")
                    too_similar_counter += 1
                else:
                    new_nouns_article = self.nouns(
                        linked_article.cleaned_text)
                    new_counter_nouns_articles = Counter(
                        self.tokens(new_nouns_article))
                    # Nouns that occur more than once in the original and
                    # also appear in the related article.
                    shared_items = [
                        k for k in counter_nouns_article
                        if k in new_counter_nouns_articles
                        and counter_nouns_article[k] > 1
                    ]
                    score_article = len(shared_items) / counter_article
                    if score_article > 0.4:
                        scores_new_articles.append(score_article)
                        interesting_articles += 1
                        dict_interesting_articles[linked_url] = (
                            linked_article.title, score_article)
                    else:
                        logger.debug("Too low score : %s", score_article)
                    # Only articles that were actually scored (neither
                    # blocked nor too similar) count toward the total.
                    nb_articles += 1
                    logger.debug("Percentage for new articles : %s",
                                 scores_new_articles)
            except (ValueError, LookupError, RequestException) as e:
                logger.error(
                    f"Found page that can't be processed : {linked_url} with error message {e}"
                )
    # Compute the article's score.
    if nb_articles == 0:
        # Nothing comparable found: delete the record and report why.
        self.delete()
        message = (
            "Nous n'avons trouvé que des articles trop similaires au vôtre. "
            "Il se peut qu'ils proviennent tous de la même source.")
        if blocked_counter > too_similar_counter:
            message = "Nous avons trouvé en majorité des articles dont nous n'avons pas pu extraire le contenu."
        raise APIException.info(message)
    elif interesting_articles == 0:
        content_score = 0
    else:
        # Average of two 0-100 components: the percentage of related
        # articles passing the 0.4 threshold, and the mean shared-noun
        # ratio scaled by 1.5 and capped at 100. int(...)/10 truncates to
        # one decimal place.
        content_score = (
            (int(interesting_articles / nb_articles * 1000) / 10) +
            min(100.0, (int(
                (mean(scores_new_articles) * 1.5) * 1000) / 10))) / 2
    logger.debug("Article score : {}".format(content_score))
    self.content_score = content_score
    self.total_articles = nb_articles
    self._store_interesting_related_articles(dict_interesting_articles)
def compute_scores(self) -> 'WebPage':
    """Extract the article at ``self.url``, compute its content score and save.

    Fetches the page with Goose, looks for related articles (first within
    7 days of publication, then 30 days), and delegates the actual scoring
    to ``_compute_content_score``. On every failure path the WebPage record
    is deleted before raising, so it is not left half-processed.

    Returns:
        The updated WebPage instance (self).

    Raises:
        APIException: warning for invalid/unreachable/empty pages or when
            scoring is impossible; info when the article appears isolated.
    """
    logger.debug("Start compute_scores")
    # Extract the title and the text of the article.
    g = Goose({
        'browser_user_agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"
    })
    try:
        article = g.extract(url=self.url)
    except InvalidSchema:
        self.delete()
        raise APIException.warning("Adresse invalide")
    except RequestException:
        self.delete()
        raise APIException.warning("Le site n'est pas joignable")

    logger.debug("Text of the article : %s", article.cleaned_text)
    if not article.cleaned_text:
        # Extraction yielded no text (empty, and robust to None as well).
        self.delete()
        raise APIException.warning(
            "Oups, nous n'avons pas pu extraire le texte de l'article.")

    nouns_article = self.nouns(article.cleaned_text)
    counter_nouns_article = Counter(self.tokens(nouns_article))
    logger.debug("Nouns in the article : %s", counter_nouns_article)

    # Look for related articles published within 7 days; widen the window
    # to 30 days when nothing usable (or only the same publisher) is found.
    related_articles = get_related_articles(article, 7)
    only_same_publisher = self.check_same_publisher(related_articles)
    if not related_articles.get("value") or only_same_publisher is True:
        logger.debug(
            "No article found, try with a period of 30 days before publishing."
        )
        related_articles = get_related_articles(article, 30)
        only_same_publisher = self.check_same_publisher(related_articles)
        if not related_articles.get(
                "value") or only_same_publisher is True:
            # Remember the isolated article, then give up on scoring it.
            IsolatedArticle.objects.get_or_create(
                url=self.url, base_domain=self.base_domain)
            self.delete()
            raise APIException.info(
                "Cet article semble isolé, nous n'avons trouvé aucun article "
                "en lien avec lui. Faites attention!")
    logger.debug("Articles found %s", related_articles)

    # "Interesting" nouns are those occurring more than once; they form the
    # denominator of the content score.
    counter_article = sum(
        1 for count in counter_nouns_article.values() if count > 1)
    logger.debug("Number of interesting nouns : %s", counter_article)
    if counter_article > 2:
        self._compute_content_score(counter_nouns_article, related_articles,
                                    counter_article, article)
    else:
        # Too few significant nouns to produce a meaningful score.
        self.delete()
        raise APIException.warning(
            "Notre méthode de calcul n'a pas pu fournir de résultat sur cet article."
        )
    self.scores_version = WebPage.CURRENT_SCORES_VERSION
    self.save()
    logger.info(f"Finished computing scores for article {self.url}")
    return self