Example #1
def cache_surrounding_pages(query: Query) -> None:
    """Cache the surrounding pages for the query for the user.

    Args:
        query: Query made by a user that should have its next page loaded into
            the next page cache.
    """
    utils.toggle_myaku_package_log(filename_base='web_worker')
    utils.toggle_myaku_package_log(
        filename_base='web_worker', package='search'
    )

    cache_client = NextPageCache()
    current_page_num = query.page_num
    if current_page_num < settings.MAX_SEARCH_RESULT_PAGE:
        query.page_num = current_page_num + 1
        with ArticleIndexSearcher() as searcher:
            forward_page = searcher.search_articles(query)
        cache_client.set(
            query.user_id, forward_page, NextPageDirection.FORWARD
        )

    # Only cache the backward page if its page number is greater than 1,
    # because page 1 is always served from the first page cache instead.
    if current_page_num > 2:
        query.page_num = current_page_num - 1
        with ArticleIndexSearcher() as searcher:
            backward_page = searcher.search_articles(query)
        cache_client.set(
            query.user_id, backward_page, NextPageDirection.BACKWARD
        )
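
The page-window bounds used above can be summarized on their own. The following is a minimal standalone sketch of just that logic, with max_page standing in for settings.MAX_SEARCH_RESULT_PAGE; the helper name and return format are illustrative, not part of the Myaku codebase:

from typing import List, Tuple


def neighbor_pages_to_cache(
    current_page_num: int, max_page: int
) -> List[Tuple[str, int]]:
    """Return the (direction, page number) pairs that would be cached."""
    pages = []
    # The forward page is only cached while a next page can still exist.
    if current_page_num < max_page:
        pages.append(('FORWARD', current_page_num + 1))
    # The backward page is only cached when it isn't page 1, since page 1 is
    # always available from the first page cache.
    if current_page_num > 2:
        pages.append(('BACKWARD', current_page_num - 1))
    return pages


# For example, with max_page=20: page 1 caches only page 2 forward, page 5
# caches pages 6 and 4, and page 20 caches only page 19 backward.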
Example #2
def main() -> None:
    """Build a shelf for JMdict data."""
    utils.toggle_myaku_package_log(filename_base='build_shelf')

    # Creating a JapaneseTextAnalyzer object will automatically create the
    # JMdict shelf if it doesn't already exist.
    JapaneseTextAnalyzer()
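
The create-on-first-use behavior that the comment describes can be illustrated with the standard library shelve module. This is only a sketch of the general pattern under assumed names and paths; it is not the actual JapaneseTextAnalyzer or shelf-building code:

import shelve
from pathlib import Path

# Hypothetical shelf location; the real path used by Myaku is not shown here.
_SHELF_PATH = Path('jmdict_shelf')


class JmdictShelfUser:
    """Open the JMdict shelf on init, building it first if it's missing."""

    def __init__(self) -> None:
        # shelve adds its own file extension depending on the dbm backend, so
        # check for any existing file that starts with the shelf name.
        already_built = any(Path('.').glob(_SHELF_PATH.name + '*'))
        if not already_built:
            self._build_shelf()

    def _build_shelf(self) -> None:
        with shelve.open(str(_SHELF_PATH)) as shelf:
            # A real builder would parse the JMdict data and store an entry
            # per term here.
            shelf['食べる'] = {'reading': 'たべる', 'senses': ['to eat']}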
Example #3
def main() -> None:
    """Run a most recent crawl for the script arg-specified crawlers."""
    utils.toggle_myaku_package_log(filename_base=LOG_NAME)
    stats = CrawlStats()
    jta = JapaneseTextAnalyzer()
    scorer = MyakuArticleScorer()

    crawler_types = parse_crawler_types_arg()
    for crawler_type in crawler_types:
        crawl_most_recent(crawler_type, jta, scorer, stats)
    stats.finish_stat_tracking()
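
parse_crawler_types_arg itself is not shown in this example. A sketch of what an argparse-based version could look like is below; the crawler names here are placeholders, not the real Myaku crawler registry:

import argparse
from typing import List

# Placeholder names; the real crawler types are defined in the myaku package.
_CRAWLER_TYPE_NAMES = ['example_news_site', 'example_blog_site']


def parse_crawler_types_arg() -> List[str]:
    """Parse which crawler types to run from the command-line args."""
    parser = argparse.ArgumentParser(
        description='Run a most-recent crawl for the given crawlers.'
    )
    parser.add_argument(
        'crawlers', nargs='+', choices=_CRAWLER_TYPE_NAMES,
        help='Crawler types to run a most-recent crawl for.'
    )
    return parser.parse_args().crawlers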
Example #4
REQUEST_PAGE_NUM_KEY = 'p'

MAX_QUERY_LEN = 120

_ARTICLE_LEN_GROUPS = [
    (700, 'Short length'),
    (1200, 'Medium length'),
    (2000, 'Long length')
]
_ARTICLE_LEN_GROUP_MAX_NAME = 'Very long length'

_VERY_RECENT_DAYS = 7

# Enable logging to the same log files for both the myaku package and this
# search package.
utils.toggle_myaku_package_log(filename_base='myakuweb')
utils.toggle_myaku_package_log(filename_base='myakuweb', package='search')


def is_very_recent(dt: datetime) -> bool:
    """Return True if the datetime is considered very recent."""
    days_since_dt = (datetime.utcnow() - dt).days
    return days_since_dt <= _VERY_RECENT_DAYS


def json_serialize_datetime(dt: datetime) -> str:
    """Serialize a naive datetime to a UTC ISO format string."""
    return dt.isoformat(timespec='seconds') + 'Z'


class ResourceLink(NamedTuple):
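
The module-level toggle_myaku_package_log calls above attach file logging for both the myaku package and the search package. Its implementation does not appear in any of these examples; the sketch below shows the general pattern such a helper might follow using only the standard logging module, with an assumed log directory and handler format:

import logging

# Assumed log directory; the real location is configured by myaku.
_LOG_DIR = '.'


def toggle_package_file_log(
    filename_base: str, package: str = 'myaku', enable: bool = True
) -> None:
    """Attach (or remove) a file handler on a package's root logger."""
    package_log = logging.getLogger(package)
    if not enable:
        for handler in list(package_log.handlers):
            package_log.removeHandler(handler)
        return

    package_log.setLevel(logging.DEBUG)
    handler = logging.FileHandler(
        f'{_LOG_DIR}/{filename_base}.{package}.log'
    )
    handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    package_log.addHandler(handler)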
Example #5
def main() -> None:
    """Build the full search result first page cache."""
    utils.toggle_myaku_package_log(filename_base='build_cache')
    with ArticleIndexDb() as db, ArticleIndexSearcher() as searcher:
        build_cache(db, searcher)
Example #6
        cursor = db.article_collection.find(query, {'source_url': 1})
        cursor.sort('_id', pymongo.ASCENDING)

        removed_count = 0
        checker = ArticleRemovedChecker()
        for i, doc in enumerate(cursor):
            if i % 100 == 0:
                _log.info('Checked %s\tRemoved %s', i, removed_count)

            if checker.check_if_404(doc['source_url']):
                removed_count += 1
                result = db.article_collection.update_one(
                    {'_id': doc['_id']},
                    {'$set': {'page_removed': True}}
                )
                _log.debug('Updated article with _id "%s" as removed: %s',
                           doc['_id'], result.raw_result)
            else:
                _log.debug('Article with _id "%s" has not been removed',
                           doc['_id'])


if __name__ == '__main__':
    _log = logging.getLogger('myaku.runners.check_for_removed_articles')
    utils.toggle_myaku_package_log(filename_base='check_for_removed_articles')
    try:
        main()
    except BaseException:
        _log.exception('Unhandled exception in main')
        raise
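
ArticleRemovedChecker is not defined in this example. The sketch below shows one way a check_if_404 method could be written with the requests library; it is an assumption about the pattern, not the actual Myaku implementation:

import requests


class RemovedPageChecker:
    """Illustrative stand-in for the ArticleRemovedChecker used above."""

    def check_if_404(self, url: str) -> bool:
        """Return True if the page at the given url now responds with a 404."""
        try:
            response = requests.head(url, allow_redirects=True, timeout=10)
        except requests.RequestException:
            # A network error doesn't confirm the page was removed.
            return False
        return response.status_code == 404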
Example #7
def main() -> None:
    """Update the scores of the articles in the crawl db."""
    utils.toggle_myaku_package_log(filename_base=LOG_NAME)
    timer = Timer('rescore')
    rescore_article_index()
    timer.stop()
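
The Timer class used here is also not shown in these examples. A minimal stand-in with the same start-on-construction, stop-and-log behavior might look like the sketch below (the log message format is assumed):

import logging
import time

_log = logging.getLogger(__name__)


class SimpleTimer:
    """Illustrative stand-in for the Timer used in the runner above."""

    def __init__(self, task_name: str) -> None:
        self._task_name = task_name
        self._start = time.monotonic()

    def stop(self) -> None:
        """Log the time elapsed since the timer was created."""
        elapsed = time.monotonic() - self._start
        _log.info('%s took %.2f seconds', self._task_name, elapsed)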