def __init__(self): """Initialize the index database connection.""" self._db = ArticleIndexDb(DataAccessMode.READ_WRITE) # Track various info for each found lexical item written to the index # for use when updating the article index first page cache on builder # close. # Maps a found lexical item base form to the tracked info for that # lexical item. self._indexed_fli_info_map: Dict[str, _IndexedLexicalItemInfo] = {}
def main() -> None: """Check if any articles in the crawl db are no longer reachable.""" source_name, last_checked_id = parse_script_args() with ArticleIndexDb(DataAccessMode.READ_UPDATE) as db: query: Dict[str, Any] = {'source_name': source_name} if last_checked_id: query['_id'] = {'$gt': ObjectId(last_checked_id)} cursor = db.article_collection.find(query, {'source_url': 1}) cursor.sort('_id', pymongo.ASCENDING) removed_count = 0 checker = ArticleRemovedChecker() for i, doc in enumerate(cursor): if i % 100 == 0: _log.info('Checked %s\tRemoved %s', i, removed_count) if checker.check_if_404(doc['source_url']): removed_count += 1 result = db.article_collection.update_one( {'_id': doc['_id']}, {'$set': { 'page_removed': True }}) _log.debug('Updated article with _id "%s" as removed: %s', doc['_id'], result.raw_result) else: _log.debug('Article with _id "%s" has not been removed', doc['_id'])
def rescore_article_index() -> None:
    """Rescore all articles needing rescoring in the article index.

    See the module docstring for more info on why article rescoring is
    necessary.

    Updates the article index database and its first page cache to reflect
    the new quality scores of the rescored articles.
    """
    current_rescore_datetime = datetime.utcnow()
    with ArticleIndexDb(DataAccessMode.READ_UPDATE) as db:
        base_form_article_key_map = _rescore_article_index_database(db)
        _update_first_page_cache(base_form_article_key_map)
        _update_last_rescore_datetime(db, current_rescore_datetime)
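# Illustrative sketch only: _update_last_rescore_datetime() is defined
# elsewhere in this module. Presumably it persists the completed rescore
# timestamp so the next run knows which articles changed since then; the
# rescore_tracking_collection attribute used here is a hypothetical name.
def _update_last_rescore_datetime(
        db: ArticleIndexDb, rescore_datetime: datetime) -> None:
    """Record the datetime of the last completed article index rescore."""
    db.rescore_tracking_collection.replace_one(  # hypothetical collection
        {'_id': 'last_rescore'},
        {'_id': 'last_rescore', 'last_rescore_datetime': rescore_datetime},
        upsert=True)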
def __init__(self): """Initialize the index database and cache connections.""" self._db = ArticleIndexDb(DataAccessMode.READ) self._first_page_cache = FirstPageCache() self._next_page_cache = NextPageCache()
class ArticleIndexSearcher(object):
    """Interface object for searching the Myaku article index."""

    def __init__(self):
        """Initialize the index database and cache connections."""
        self._db = ArticleIndexDb(DataAccessMode.READ)
        self._first_page_cache = FirstPageCache()
        self._next_page_cache = NextPageCache()

    def close(self) -> None:
        """Close the index database connection."""
        self._db.close()

    def __enter__(self) -> 'ArticleIndexSearcher':
        """Return self on context enter."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Invoke close() method of self on context exit."""
        self.close()

    def _get_query_article_count(self, query: Query) -> int:
        """Get the total number of articles in the db matching the query.

        Does not consider the page number of the query when counting the
        number of matching articles in the database.
        """
        query_field = self._db.QUERY_TYPE_QUERY_FIELD_MAP[query.query_type]
        cursor = self._db.found_lexical_item_collection.aggregate([
            {'$match': {query_field: query.query_str}},
            {'$group': {'_id': '$article_oid'}},
            {'$count': 'total'},
        ])
        docs = list(cursor)
        return docs[0]['total'] if len(docs) > 0 else 0

    def _read_articles(
            self, object_ids: List[ObjectId]) -> Dict[ObjectId, JpnArticle]:
        """Read the articles for the given ObjectIds from the database.

        Args:
            object_ids: ObjectIds for articles to read from the database.

        Returns:
            A mapping from the given ObjectIds to the article stored in the
            database for that ObjectId.
        """
        article_docs = self._db.read_with_log(
            '_id', object_ids, self._db.article_collection)

        blog_oids = list(
            set(doc['blog_oid'] for doc in article_docs if doc['blog_oid']))
        if len(blog_oids) > 0:
            blog_docs = self._db.read_with_log(
                '_id', blog_oids, self._db.blog_collection)
            oid_blog_map = convert_docs_to_blogs(blog_docs)
        else:
            oid_blog_map = {}

        oid_article_map = convert_docs_to_articles(article_docs, oid_blog_map)
        return oid_article_map

    def _get_article_docs_from_search_results(
            self, search_results_cursor: Cursor, quality_score_field: str,
            results_start_index: int,
            max_results_to_return: int) -> List[Document]:
        """Merge the top search result docs together to get one per article.

        Args:
            search_results_cursor: Cursor that will yield search result docs
                in ranked order.
            quality_score_field: Name of field in the docs yielded from the
                search_results_cursor that has the quality score for the
                search result.
            results_start_index: Index of the search results to start getting
                article docs from. Indexing starts at 0.
            max_results_to_return: Max number of search result article docs
                to get. If the cursor reaches the last search result before
                reaching this max, will return all of the search result
                article docs that could be retrieved.

        Returns:
            A list of ranked search result docs with only one per article.
""" article_search_result_docs: List[Document] = [] last_article_oid = None skipped_articles = 0 for doc in search_results_cursor: if doc['article_oid'] != last_article_oid: last_article_oid = doc['article_oid'] if skipped_articles != results_start_index: skipped_articles += 1 continue if len(article_search_result_docs) == max_results_to_return: break article_search_result_docs.append({ 'article_oid': doc['article_oid'], 'matched_base_forms': [doc['base_form']], 'found_positions': doc['found_positions'], 'quality_score': doc[quality_score_field], }) elif skipped_articles == results_start_index: article_search_result_docs[-1]['matched_base_forms'].append( doc['base_form']) article_search_result_docs[-1]['found_positions'].extend( doc['found_positions']) return article_search_result_docs @utils.skip_method_debug_logging def _get_from_first_page_cache(self, query: Query) -> Optional[SearchResultPage]: """Get first page of search results for query from first page cache. Args: query: Query to get the first page of search results for from the cache. Returns: The first page of search results for the query if the page was in the first page cache, or None of the first page of search results for the query was not in the first page cache. """ _log.debug('Checking first page cache for query "%s"', query) cached_first_page = self._first_page_cache.get(query) if cached_first_page: _log.info('Page for query "%s" retrieved from first page cache', query) return cached_first_page _log.debug('Query "%s" search results not found in first page cache', query) return None @utils.skip_method_debug_logging def _get_from_next_page_cache(self, query: Query) -> Optional[SearchResultPage]: """Get page of search results from the next page cache. Args: query: Query to get a page of search results for from the cache. Returns: The page of search results for the query if the page for it was in the next page cache, or None if the page for the query was not in the next page cache. """ _log.debug('Checking next page cache for query "%s"', query) cached_next_page = self._next_page_cache.get(query) if cached_next_page: _log.info('Page for query "%s" retrieved from next page cache', query) return cached_next_page _log.debug('Query "%s" search results not found in next page cache', query) return None @utils.skip_method_debug_logging def search_articles_using_db(self, query: Query) -> SearchResultPage: """Search for articles that match the query using only the index db. Does not use the search result caches in any case. The search results are in ranked order by quality score. See the scorer module for more info on how quality scores are determined. Args: query: Query to get a page of search results for from the db. Returns: The queried page of search results. 
""" query_field = self._db.QUERY_TYPE_QUERY_FIELD_MAP[query.query_type] score_field = self._db.QUERY_TYPE_SCORE_FIELD_MAP[query.query_type] cursor = self._db.found_lexical_item_collection.find( {query_field: query.query_str}) cursor.sort([ (score_field, pymongo.DESCENDING), ('article_last_updated_datetime', pymongo.DESCENDING), ('article_oid', pymongo.DESCENDING), ]) search_result_docs = self._get_article_docs_from_search_results( cursor, score_field, (query.page_num - 1) * SEARCH_RESULTS_PAGE_SIZE, SEARCH_RESULTS_PAGE_SIZE) article_oids = [doc['article_oid'] for doc in search_result_docs] oid_article_map = self._read_articles(article_oids) search_results = convert_docs_to_search_results( search_result_docs, oid_article_map) return SearchResultPage( query=query, total_results=self._get_query_article_count(query), search_results=search_results) def search_articles(self, query: Query) -> SearchResultPage: """Search the index for articles that match the lexical item query. Uses cached search results if possible. The search results are in ranked order by quality score. See the scorer module for more info on how quality scores are determined. Args: query: Query to get a page of search results for from the db. Returns: The queried page of search results. """ if query.page_num == 1: cached_first_page = self._get_from_first_page_cache(query) if cached_first_page: return cached_first_page else: cached_next_page = self._get_from_next_page_cache(query) if cached_next_page: return cached_next_page _log.info( 'Query "%s" search results will be retrieved from the crawl ' 'database', query) return self.search_articles_using_db(query)
def main() -> None: """Build the full search result first page cache.""" utils.toggle_myaku_package_log(filename_base='build_cache') with ArticleIndexDb() as db, ArticleIndexSearcher() as searcher: build_cache(db, searcher)
class ArticleIndexBuilder(object):
    """Builder for the Myaku article index."""

    MAX_ALLOWED_ARTICLE_LEN = 2**16  # 65,536

    def __init__(self):
        """Initialize the index database connection."""
        self._db = ArticleIndexDb(DataAccessMode.READ_WRITE)

        # Track various info for each found lexical item written to the index
        # for use when updating the article index first page cache on builder
        # close.
        # Maps a found lexical item base form to the tracked info for that
        # lexical item.
        self._indexed_fli_info_map: Dict[str, _IndexedLexicalItemInfo] = {}

    def close(self) -> None:
        """Close the index database connection."""
        try:
            self._update_first_page_cache()
        finally:
            self._db.close()

    def __enter__(self) -> 'ArticleIndexBuilder':
        """Return self on context enter."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Invoke close() method of self on context exit."""
        self.close()

    # Debug level logging can be extremely noisy (can be over 1gb) when
    # enabled during this function, so switch to info level if logging.
    @utils.set_package_log_level(logging.INFO)
    def _update_first_page_cache(self) -> None:
        """Update the first page cache to reflect the newly indexed articles.

        Uses the info tracked in the self._indexed_fli_info_map over the
        lifetime of this builder in order to make the cache updates in place
        instead of recaching from the database if possible.
        """
        _log.info('Beginning first page cache update...')
        first_page_cache = FirstPageCache()

        update_count = 0
        fli_info_map = self._indexed_fli_info_map
        with ArticleIndexSearcher() as searcher:
            for i, (base_form, fli_info) in enumerate(fli_info_map.items()):
                if i % 1000 == 0:
                    _log.info(f'Updated {i:,} / {len(fli_info_map):,} keys')

                query_to_update = Query(base_form, 1)
                needs_recache = first_page_cache.is_recache_required(
                    query_to_update, fli_info.best_article_rank_key)
                if needs_recache:
                    search_result_page = searcher.search_articles_using_db(
                        query_to_update)
                    first_page_cache.set(search_result_page)
                    update_count += 1
                else:
                    first_page_cache.increment_total_result_count(
                        query_to_update, fli_info.new_article_count)

        _log.info(
            f'Completed first page cache update with {update_count:,} '
            f'keys needing recaching and '
            f'{len(fli_info_map) - update_count:,} keys updated in place')

    def _is_article_text_stored(self, article: JpnArticle) -> bool:
        """Return True if an article with the same text is already stored."""
        docs = self._db.read_with_log(
            'text_hash', article.text_hash, self._db.article_collection,
            {'text_hash': 1, '_id': 0})
        return len(docs) > 0

    def can_store_article(self, article: JpnArticle) -> bool:
        """Return True if the article is safe to store in the db.

        Checks that:
            1. The article is not too long.
            2. There is not an article with the exact same text already
               stored in the db.
        """
        if self._is_article_text_stored(article):
            _log.info('Article %s already stored!', article)
            return False

        if len(article.full_text) > self.MAX_ALLOWED_ARTICLE_LEN:
            _log.info('Article %s is too long to store (%s chars)',
                      article, len(article.full_text))
            return False

        return True

    def _get_fli_safe_articles(
            self, flis: List[FoundJpnLexicalItem]) -> List[JpnArticle]:
        """Get the unique articles referenced by the found lexical items.

        Does NOT include any articles in the returned list that cannot be
        safely stored in the index db (due to being too long, etc.).
        """
        # Many found lexical items can point to the same article object in
        # memory, so dedupe using id() to get each article object only once.
        article_id_map = {id(item.article): item.article for item in flis}
        articles = list(article_id_map.values())
        return [a for a in articles if self.can_store_article(a)]

    def _get_article_blogs(
            self, articles: List[JpnArticle]) -> List[JpnArticleBlog]:
        """Get the unique blogs referenced by the articles."""
        articles_with_blog = [a for a in articles if a.blog]

        # Many articles can point to the same blog object in memory, so
        # dedupe using id() to get each blog object only once.
        blog_id_map = {id(a.blog): a.blog for a in articles_with_blog}
        return list(blog_id_map.values())

    def _write_blogs(
            self, blogs: List[JpnArticleBlog]) -> Dict[int, ObjectId]:
        """Write the blogs to the database.

        Args:
            blogs: Blogs to write to the database.

        Returns:
            A mapping from the id() for each given blog to the ObjectId that
            blog was written with.
        """
        blog_docs = convert_blogs_to_docs(blogs)
        object_ids = self._db.replace_write_with_log(
            blog_docs, self._db.blog_collection, 'source_url')
        blog_oid_map = {id(b): oid for b, oid in zip(blogs, object_ids)}
        return blog_oid_map

    def _read_article_oids(
            self, articles: List[JpnArticle]) -> Dict[int, ObjectId]:
        """Read the ObjectIds for the articles from the database.

        Args:
            articles: Articles to read from the database.

        Returns:
            A mapping from the id() for each given article to the ObjectId
            that article is stored with.
        """
        source_urls = [a.source_url for a in articles]
        docs = self._db.read_with_log(
            'source_url', source_urls, self._db.article_collection,
            {'source_url': 1})
        source_url_oid_map = {d['source_url']: d['_id'] for d in docs}
        article_oid_map = {
            id(a): source_url_oid_map[a.source_url] for a in articles
        }
        return article_oid_map

    def _write_articles(
            self, articles: List[JpnArticle]) -> Dict[int, ObjectId]:
        """Write the articles to the database.

        Args:
            articles: Articles to write to the database.

        Returns:
            A mapping from the id() for each given article to the ObjectId
            that article was written with.
        """
        blogs = self._get_article_blogs(articles)
        blog_oid_map = self._write_blogs(blogs)

        article_docs = convert_articles_to_docs(articles, blog_oid_map)
        result = self._db.write_with_log(
            article_docs, self._db.article_collection)
        article_oid_map = {
            id(a): oid for a, oid in zip(articles, result.inserted_ids)
        }
        return article_oid_map

    def _update_tracked_fli_info(
            self, found_lexical_items: List[FoundJpnLexicalItem]) -> None:
        """Update the tracked info for the given found lexical items.

        Various information is tracked for the found lexical items indexed by
        this builder in order to efficiently update found lexical item
        entries in the first page cache for the article index when the
        builder is closed.
        """
        for fli in found_lexical_items:
            rank_key = ArticleRankKey(
                fli.article.quality_score + fli.quality_score_mod,
                fli.article.last_updated_datetime,
                fli.article.database_id,
            )

            info = self._indexed_fli_info_map.get(fli.base_form)
            if info is None:
                info = _IndexedLexicalItemInfo(fli.base_form, 0, rank_key)

            info.new_article_count += 1
            if rank_key > info.best_article_rank_key:
                info.best_article_rank_key = rank_key
            self._indexed_fli_info_map[fli.base_form] = info

    def write_found_lexical_items(
            self, found_lexical_items: List[FoundJpnLexicalItem],
            write_articles: bool = True) -> bool:
        """Write found lexical items and their articles to the index database.

        Args:
            found_lexical_items: List of found lexical items to write to the
                database.
            write_articles: If True, will write all of the articles
                referenced by the given found lexical items to the database
                as well.
                If False, will assume the articles referenced by the given
                found lexical items are already in the database.

        Returns:
            True if all of the given found lexical items were written to the
            db, or False if some or all of the given found lexical items were
            not written to the db because their articles were not safe to
            store. See the can_store_article method docstring for the reasons
            why an article could be considered unsafe.
        """
        safe_articles = self._get_fli_safe_articles(found_lexical_items)
        if write_articles:
            safe_article_oid_map = self._write_articles(safe_articles)
        else:
            safe_article_oid_map = self._read_article_oids(safe_articles)

        # Don't write found lexical items to the db unless their article is
        # safe to store.
        safe_article_flis = []
        for fli in found_lexical_items:
            if id(fli.article) in safe_article_oid_map:
                safe_article_flis.append(fli)

        found_lexical_item_docs = convert_found_lexical_items_to_docs(
            safe_article_flis, safe_article_oid_map)
        self._db.write_with_log(
            found_lexical_item_docs, self._db.found_lexical_item_collection)

        self._update_tracked_fli_info(safe_article_flis)
        return len(safe_article_flis) == len(found_lexical_items)
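# Usage sketch (not part of the class above): how a crawl pipeline might feed
# found lexical items into ArticleIndexBuilder. Only the builder API shown
# above is real; the found_lexical_items argument is assumed to come from the
# crawler/analysis stage.
def example_index(found_lexical_items: List[FoundJpnLexicalItem]) -> None:
    with ArticleIndexBuilder() as builder:
        all_written = builder.write_found_lexical_items(found_lexical_items)
        if not all_written:
            _log.info('Some found lexical items were not written because '
                      'their articles were not safe to store')
    # Closing the builder (via the with block) triggers the first page cache
    # update for everything written during its lifetime.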
def __init__(self): """Initialize the Myaku index database connection.""" self._db = ArticleIndexDb(DataAccessMode.READ_UPDATE)
class CrawlTracker(object):
    """Tracker for items crawled by Myaku crawlers."""

    def __init__(self):
        """Initialize the Myaku index database connection."""
        self._db = ArticleIndexDb(DataAccessMode.READ_UPDATE)

    def close(self) -> None:
        """Close the connection to the Myaku index database."""
        self._db.close()

    def __enter__(self) -> 'CrawlTracker':
        """Return self on context enter."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        """Invoke close() method of self on context exit."""
        self.close()

    @utils.skip_method_debug_logging
    def _get_last_crawled_map(
            self, crawlable_items: List[Crawlable_co]) -> Dict[str, datetime]:
        """Get a map from crawlable item source url to last crawled datetime.

        Args:
            crawlable_items: List of crawlable items to look up the last
                crawled datetime for in the Myaku index database.

        Returns:
            A mapping from the source url of one of the given crawlable items
            to the last crawled datetime for that item. If the source url of
            one of the given crawlable items is not in the returned
            dictionary, it means that it has never been crawled before.
        """
        if len(crawlable_items) == 0:
            return {}

        cursor = self._db.crawlable_coll_map[type(crawlable_items[0])].find(
            {'source_url': {
                '$in': [i.source_url for i in crawlable_items]
            }},
            {'_id': 0, 'source_url': 1, 'last_crawled_datetime': 1})
        last_crawled_map = {
            d['source_url']: d['last_crawled_datetime'] for d in cursor
        }
        return last_crawled_map

    @utils.skip_method_debug_logging
    def _get_skipped_crawlable_urls(
            self, crawlable_items: List[Crawlable_co]) -> Set[str]:
        """Get the crawl skipped source urls from the given crawlable items.

        Args:
            crawlable_items: List of crawlable items whose source urls to
                look up in the Myaku index database to check which are marked
                as having been skipped during crawling.

        Returns:
            A set containing the source urls for the given crawlable items
            that are marked in the database as having been skipped during
            crawling.
        """
        if len(crawlable_items) == 0:
            return set()

        cursor = self._db.crawl_skip_collection.find(
            {'source_url': {
                '$in': [i.source_url for i in crawlable_items]
            }},
            {'_id': 0, 'source_url': 1})
        return set(doc['source_url'] for doc in cursor)

    def filter_crawlable_to_updated(
            self, crawlable_items: List[Crawlable_co]) -> List[Crawlable_co]:
        """Return a new list with only the items updated since last crawled.

        The new list includes items that have never been crawled as well.
""" total_items = len(crawlable_items) _log.debug('Will apply filter to %s crawlable items', total_items) if total_items == 0: return [] last_crawled_map = self._get_last_crawled_map(crawlable_items) crawl_skip_urls = self._get_skipped_crawlable_urls(crawlable_items) updated_items = [] unstored_count = 0 partial_stored_count = 0 updated_count = 0 skipped_count = 0 for item in crawlable_items: item_url = item.source_url if item_url in crawl_skip_urls: skipped_count += 1 elif item_url not in last_crawled_map: unstored_count += 1 updated_items.append(item) elif last_crawled_map[item_url] is None: partial_stored_count += 1 updated_items.append(item) elif (item.last_updated_datetime is not None and item.last_updated_datetime > last_crawled_map[item_url]): updated_count += 1 updated_items.append(item) _log.debug( 'Filtered to %s unstored, %s partially stored, and %s updated ' 'crawlable items of type %s (%s crawl skipped)', unstored_count, partial_stored_count, updated_count, type(crawlable_items[0]), skipped_count) return updated_items def update_last_crawled_datetime(self, item: Crawlable) -> None: """Update the last crawled datetime of the item in the Myaku database. If a crawlable item with the source url of the given item is not found in the database, marks the source url as having been skipped during crawling instead. """ _log.debug( 'Updating the last crawled datetime for item "%s" of type %s', item, type(item)) result = self._db.crawlable_coll_map[type(item)].update_one( {'source_url': item.source_url}, {'$set': { 'last_crawled_datetime': item.last_crawled_datetime }}) _log.debug('Update result: %s', result.raw_result) if result.matched_count == 0: _log.debug( 'Source url "%s" for item was not found in the db, so marking ' 'as crawl skipped', item.source_url) self._db.crawl_skip_collection.insert_one({ 'source_url': item.source_url, 'source_name': item.source_name, 'last_crawled_datetime': item.last_crawled_datetime })