def add_searches(title, url, new_article, session):
    """
    Extract search keywords from an article's title and URL and attach them.

    The candidate words are the whitespace-separated words of the title, the
    pieces of the URL path, and the first label of the URL's network location
    (the sub domain). Every candidate is cleaned with
    strip_article_title_word; candidates that end up empty, equal to 'www',
    purely numeric, shorter than 3 or longer than 25 characters are skipped.
    For each remaining word the matching ArticleWord is looked up (or created
    when none exists), linked to the article and added to the session.
    Nothing is committed here; the session is meant to be committed as a
    whole by the caller.

    :param title: The title of the article
    :param url: The url of the article
    :param new_article: The actual new article
    :param session: The session to which it should be added.
    """
    from urllib.parse import urlparse

    # Parse the URL so we can call netloc and path without a lot of regex
    parsed_url = urlparse(url)

    # Split the title, path and url netloc (sub domain)
    all_words = title.split()
    all_words += re.split(r'; |, |\*|-|%20|/', parsed_url.path)
    # append() rather than +=: extending a list with a string adds its
    # characters one at a time, which would all be dropped by the length
    # filter below and the sub domain would never be indexed.
    all_words.append(parsed_url.netloc.split('.')[0])

    for word in all_words:
        # Strip the unwanted characters
        word = strip_article_title_word(word)

        # Skip words that are empty, 'www', only digits or of improper length
        if (
            word in ('www', '', ' ')
            or word.isdigit()
            or not 3 <= len(word) <= 25
        ):
            continue

        # Find or create the ArticleWord and add it to the session
        article_word_obj = ArticleWord.find_by_word(word)
        if article_word_obj is None:
            article_word_obj = ArticleWord(word)
        article_word_obj.add_article(new_article)
        session.add(article_word_obj)
def _get_articles_for_search_term(search_term):
    """
    Return the articles that match every word of a search term.

    The search term is lower-cased and split on whitespace; for each word the
    articles are fetched with ArticleWord.get_articles_for_word and the
    per-word results are intersected.

    :param search_term: the search string, possibly consisting of several
        whitespace-separated words
    :return: a set with the articles returned for all of the words; an empty
        set when the search term contains no words
    """
    search_terms = search_term.lower().split()

    # An empty or whitespace-only search term has no words to look up;
    # without this guard the intersection below would fail with IndexError.
    if not search_terms:
        return set()

    individual_term_results = [
        set(ArticleWord.get_articles_for_word(each)) for each in search_terms
    ]

    return individual_term_results[0].intersection(*individual_term_results[1:])
Beispiel #3
0
    def testDownloadWithWords(self):
        """
        Download articles from a test feed and check the ArticleWord index.

        Takes the first word of the first retrieved article's title, cleans
        it with strip_article_title_word, and verifies that an ArticleWord
        linking it to the article exists exactly when the word passes the
        keyword filter (not 'www', not empty, not only digits, and between
        3 and 25 characters long).
        """
        test_feed = RSSFeedRule().feed1
        download_from_feed(test_feed, zeeguu_core.db.session, 3)

        first_article = test_feed.get_articles(limit=2)[0]

        # Only the first title word is checked. It may have been filtered
        # out when the article was indexed, so both outcomes are handled.
        raw_word = first_article.title.split()[0]
        cleaned_word = strip_article_title_word(raw_word)
        indexed_entry = ArticleWord.find_by_word(cleaned_word)

        rejected_by_filter = (
            cleaned_word in ('www', '')
            or cleaned_word.isdigit()
            or not 3 <= len(cleaned_word) <= 25
        )

        if not rejected_by_filter:
            assert first_article in indexed_entry.articles
        else:
            assert indexed_entry is None
def _filter_subscribed_articles(
    search_subscriptions, topic_subscriptions, user_languages, user
):
    """
    Collect the most recent articles matching a user's subscriptions.

    For every language an Article query is built, newest ids first, that:

      0. keeps only non-broken articles in that language whose fk_difficulty
         lies strictly between ten times the user's minimum and maximum level
         for the language (as returned by user.levels_for);
      1. drops articles whose title or content contains a keyword from one of
         the user's search filters;
      2. drops articles that have a topic from one of the user's topic
         filters;
      3./4. when there are subscribed topics or articles found for the
         subscribed searches, keeps only articles that have a subscribed
         topic or were found for a subscribed search.

    A fixed budget of 30 articles is divided evenly over the languages.

    :param search_subscriptions: the search subscriptions; each one is read
        through its `.search.keywords`
    :param topic_subscriptions: the topic subscriptions; each one is read
        through its `.topic.id`
    :param user_languages: the languages for which articles are retrieved
    :param user: the user whose search filters, topic filters and levels
        are applied
    :return: a set with the articles retrieved for all languages; empty when
        user_languages is empty
    """

    # Without languages there is nothing to retrieve; returning early also
    # avoids dividing the article budget by zero below.
    if not user_languages:
        return set()

    from zeeguu_core.model import Topic

    # TODO: shouldn't this be passed down from upstream?
    total_article_count = 30
    # Integer division: the result is used as a LIMIT, which must be whole.
    per_language_article_count = total_article_count // len(user_languages)

    # The lookups below do not depend on the language, so they are done once
    # up front instead of once per language.

    # 1. Keywords to exclude
    keywords_to_avoid = [
        user_search_filter.search.keywords
        for user_search_filter in SearchFilter.all_for_user(user)
    ]

    # 2. Topics to exclude / filter out
    user_filters = TopicFilter.all_for_user(user)
    to_exclude_topic_ids = [each.topic.id for each in user_filters]

    # 3. Topics subscribed, and thus to include
    ids_of_topics_to_include = [
        subscription.topic.id for subscription in topic_subscriptions
    ]

    # 4. Searches to include
    ids_for_articles_containing_search_terms = set()
    for user_search in search_subscriptions:
        search_string = user_search.search.keywords.lower()
        articles_for_word = ArticleWord.get_articles_for_word(search_string)
        ids_for_articles_containing_search_terms.update(
            article.id for article in articles_for_word
        )

    final_article_mix = set()
    for language in user_languages:
        print(f"language: {language}")

        query = Article.query
        query = query.order_by(Article.id.desc())
        query = query.filter(Article.language == language)
        # `== False` is intentional: it builds a SQL comparison expression.
        query = query.filter(Article.broken == False)  # noqa: E712

        # 0. Ensure appropriate difficulty
        declared_level_min, declared_level_max = user.levels_for(language)
        lower_bounds = declared_level_min * 10
        upper_bounds = declared_level_max * 10

        query = query.filter(lower_bounds < Article.fk_difficulty)
        query = query.filter(Article.fk_difficulty < upper_bounds)

        # 1. Keywords to exclude
        # ==============================
        print(f"keywords to exclude: {keywords_to_avoid}")

        for keyword_to_avoid in keywords_to_avoid:
            # neither title nor content may contain the keyword
            query = query.filter(
                not_(
                    or_(
                        Article.title.contains(keyword_to_avoid),
                        Article.content.contains(keyword_to_avoid),
                    )
                )
            )

        # 2. Topics to exclude / filter out
        # =================================
        print(f"to exlcude topic ids: {to_exclude_topic_ids}")
        print(f"topics to exclude: {user_filters}")
        # With no topics to exclude the condition would hold for every
        # article, so the filter is only added when it can remove something.
        if to_exclude_topic_ids:
            query = query.filter(
                not_(Article.topics.any(Topic.id.in_(to_exclude_topic_ids)))
            )

        # 3. Topics subscribed, and thus to include
        # =========================================
        print(f"topics ids to include: {ids_of_topics_to_include}")

        # 4. Searches to include
        # ======================
        print(f"Search subscriptions: {search_subscriptions}")

        # Subscribed topics and subscribed searches are combined with OR, so
        # an article qualifies when it matches either kind of subscription.
        if ids_of_topics_to_include or ids_for_articles_containing_search_terms:
            query = query.filter(
                or_(
                    Article.topics.any(Topic.id.in_(ids_of_topics_to_include)),
                    Article.id.in_(ids_for_articles_containing_search_terms),
                )
            )

        query = query.limit(per_language_article_count)
        final_article_mix.update(query.all())

    return final_article_mix