def predict(self, url):
    """Return the nearest same-day article for each bias bucket.

    Builds an ``Article`` for ``url``, loads the articles stored for the
    same publish date, vectorises every article's named entities with
    TF-IDF and runs a nearest-neighbour query for the requested article.
    For each bias bucket the candidate with the smallest distance is kept.

    Args:
        url: URL of the article to find similar articles for.

    Returns:
        A dict with the keys ``'left'``, ``'center'`` and ``'right'``
        (selected by ``article.bias`` values ``1``, ``0`` and ``-1``
        respectively). Each value is either ``None`` when no candidate was
        found for that bucket, or a dict with the keys ``'distance'``,
        ``'title'``, ``'url'``, ``'description'``, ``'source_id'`` and
        ``'named_entities'``.

    Raises:
        NoNamedEntities: the requested article has no named entities.
        NoArticlesFound: loading the dataset for the publish date failed
            with ``SherlockMLDatasetsError``, or it returned no articles.
    """
    # TODO: url results can be cached
    # Get named entities for requested url
    test_article = Article(url, '', '', '')

    # Return an error if no named entities were found
    if len(test_article.named_entities) == 0:
        raise NoNamedEntities()

    # TODO: Add some caching for this
    # Fetch articles with the same publish date
    # NOTE(review): assumes published_at is always a date/datetime here;
    # confirm Article never leaves it unset.
    publish_date = test_article.published_at.strftime('%Y-%m-%d')
    try:
        articles = Article.load_articles_from_datasets(publish_date)
    except SherlockMLDatasetsError as err:
        raise NoArticlesFound() from err

    # Return an error if no articles were found for the published date
    if len(articles) == 0:
        raise NoArticlesFound()

    # One "document" per article, with the requested article appended last
    # so that its row index in the matrix is len(articles).
    named_entities_list = [
        ' '.join(article.named_entities) for article in articles
    ]
    named_entities_list.append(' '.join(test_article.named_entities))
    test_index = len(articles)

    # TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(named_entities_list)

    # Fit KNN. kneighbors() rejects n_neighbors larger than the number of
    # fitted rows, so clamp it for dates with only a few articles.
    nbrs = NearestNeighbors(n_neighbors=min(20, len(named_entities_list)))
    nbrs.fit(tfidf_matrix)

    # Predict
    test_row = tfidf_matrix.getrow(test_index)
    distances, indices = nbrs.kneighbors(test_row)

    # Format predictions
    similar_articles = {
        'left': None,
        'center': None,
        'right': None,
    }
    bucket_for_bias = {1: 'left', 0: 'center', -1: 'right'}

    # Walk distances and indices in lockstep so every neighbour is paired
    # with its own distance. The query row is skipped by index rather than
    # by position, because a row with an identical vector may rank first.
    for distance, neighbor_index in zip(distances.ravel(), indices.ravel()):
        if neighbor_index == test_index:
            continue

        article = articles[neighbor_index]

        # Ignore the requested article itself (when it is also present in
        # the dataset) and articles with at most one named entity.
        if article.url == test_article.url or len(article.named_entities) <= 1:
            continue

        bucket = bucket_for_bias.get(article.bias)
        if bucket is None:
            # Bias value outside {1, 0, -1}: no bucket to place it in.
            continue

        current = similar_articles[bucket]
        if current is None or distance < current['distance']:
            similar_articles[bucket] = {
                'distance': distance,
                'title': article.title,
                'url': article.url,
                'description': article.description,
                'source_id': article.source_id,
                'named_entities': article.named_entities,
            }

    return similar_articles