Exemple #1
0
    def predict(self, url):
        """Find the most similar article per bias bucket for the given url.

        An ``Article`` is built for ``url`` and compared against the
        articles loaded for the same publish date. Each article is
        represented by the TF-IDF vector of its space-joined named entities,
        and a nearest-neighbour search is run from the requested article's
        vector.

        Args:
            url: Address of the article to find similar articles for.

        Returns:
            A dict with the keys ``'left'``, ``'center'`` and ``'right'``.
            Each value is ``None`` when no candidate was found for that
            bucket, otherwise a dict with the keys ``'distance'``,
            ``'title'``, ``'url'``, ``'description'``, ``'source_id'`` and
            ``'named_entities'`` describing the closest candidate.

        Raises:
            NoNamedEntities: The requested article has no named entities.
            NoArticlesFound: Loading the articles for the publish date raised
                ``SherlockMLDatasetsError`` or returned no articles.
        """
        # TODO: url results can be cached
        # Get named entities for requested url
        # NOTE(review): presumably the Article constructor fetches the page
        # and extracts its entities -- confirm against the Article class.
        test_article = Article(url, '', '', '')

        # Return an error if no named entities were found
        if len(test_article.named_entities) == 0:
            raise NoNamedEntities()

        # TODO: Add some caching for this
        # Fetch articles with the same publish date
        try:
            articles = Article.load_articles_from_datasets(
                test_article.published_at.strftime('%Y-%m-%d')
            )
        except SherlockMLDatasetsError:
            raise NoArticlesFound()

        # Return an error if no articles were found for the published date
        if len(articles) == 0:
            raise NoArticlesFound()

        # One document per article; the requested article goes last so that
        # its row index equals len(articles).
        named_entities_list = [' '.join(x.named_entities) for x in articles]
        named_entities_list.append(' '.join(test_article.named_entities))
        test_index = len(articles)

        # TF-IDF matrix
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(named_entities_list)

        # Fit KNN. The neighbour count is clamped to the number of rows
        # because kneighbors() rejects n_neighbors > n_samples, which would
        # otherwise fail on dates with fewer than 19 articles.
        nbrs = NearestNeighbors(n_neighbors=min(20, len(named_entities_list)))
        nbrs.fit(tfidf_matrix)

        # Predict
        test_row = tfidf_matrix.getrow(test_index)
        distances, indices = nbrs.kneighbors(test_row)

        # Format predictions
        similar_articles = {
            'left': None,
            'center': None,
            'right': None,
        }
        # Bias value -> result bucket, as encoded by the original branches.
        bias_buckets = {1: 'left', 0: 'center', -1: 'right'}

        # Walk distances and indices in lockstep so each candidate is paired
        # with its own distance. The requested article is skipped by index
        # rather than by position: a tie at distance 0 means it is not
        # guaranteed to be the first neighbour returned.
        for distance, val in zip(distances.flatten(), indices.flatten()):
            if val == test_index:
                continue

            article = articles[val]

            # Ignore the same url stored in the dataset and articles with
            # at most one named entity.
            if article.url == test_article.url or len(article.named_entities) <= 1:
                continue

            bucket = bias_buckets.get(article.bias)
            if bucket is None:
                # Bias value outside the three known buckets.
                continue

            current = similar_articles[bucket]
            if current is None or distance < current['distance']:
                similar_articles[bucket] = {
                    'distance': distance,
                    'title': article.title,
                    'url': article.url,
                    'description': article.description,
                    'source_id': article.source_id,
                    'named_entities': article.named_entities,
                }

        return similar_articles