Ejemplo n.º 1
0
    def _create_model_object(self):
        """
        Build a Url from random faker data.

        A fresh random uri and sentence are generated and passed to
        Url.find_or_create for as long as `_exists_in_db` reports that the
        resulting object already exists.

        :return: the first Url for which `_exists_in_db` is falsy
        """
        while True:
            candidate_address = self.faker.uri()
            candidate_title = self.faker.sentence()

            candidate = Url.find_or_create(
                self.db.session, candidate_address, candidate_title)

            if not self._exists_in_db(candidate):
                return candidate
Ejemplo n.º 2
0
    def find_or_create(
        cls,
        session,
        user,
        _origin: str,
        _origin_lang: str,
        _translation: str,
        _translation_lang: str,
        _context: str,
        _url: str,
        _url_title: str,
        article_id: int,
    ):
        """
            if the bookmark does not exist, it creates it and returns it
            if it exists, it ** updates the translation** and returns the bookmark object

            In both cases the bookmark is added to the session and the
            session is committed before returning.

        :param session: database session used for the lookups and the commit
        :param user: the user the bookmark belongs to
        :param _origin: the bookmarked word
        :param _origin_lang: passed to Language.find_or_create for the origin word
        :param _translation: the translation of the bookmarked word
        :param _translation_lang: passed to Language.find_or_create for the translation
        :param _context: the text in which the word appears
        :param _url: not used by this method; the url is taken from the article
        :param _url_title: title passed to Url.find_or_create
        :param article_id: id of the article, looked up with `.one()`
        :return: the found or newly created bookmark
        """

        origin_lang = Language.find_or_create(_origin_lang)
        translation_lang = Language.find_or_create(_translation_lang)

        origin = UserWord.find_or_create(session, _origin, origin_lang)

        article = Article.query.filter_by(id=article_id).one()

        url = Url.find_or_create(session, article.url.as_string(), _url_title)

        context = Text.find_or_create(session, _context, origin_lang, url, article)

        translation = UserWord.find_or_create(session, _translation, translation_lang)

        try:
            # try to find this bookmark
            bookmark = Bookmark.find_by_user_word_and_text(user, origin, context)
        except sqlalchemy.orm.exc.NoResultFound:
            # no such bookmark yet: create it, timestamped with the current time
            bookmark = cls(origin, translation, user, context, datetime.now())
        else:
            # update the translation
            bookmark.translation = translation

        session.add(bookmark)
        session.commit()

        return bookmark
Ejemplo n.º 3
0
    def find(cls, url: str):
        """

            Look up the single object whose url matches the given string.

        :param url: url string, resolved to a Url object through Url.find
        :return: the matching object, or None when NoResultFound is raised
        """

        from zeeguu_core.model import Url
        try:
            matching_url = Url.find(url)
            lookup = cls.query.filter(cls.url == matching_url)
            return lookup.one()
        except NoResultFound:
            return None
Ejemplo n.º 4
0
    def test_url_domain(self):
        """Tests the correct retrieval of a domain from a random url

        e.g. 'https://google.com' should be retrieved from
        e.g. 'https://google.com/search'
        """
        url_random = UrlRule().url.as_string()

        # Expected domain: the part before '//', then '//', then everything
        # up to the first '/' that follows.
        url_parts = url_random.split('//', 1)
        domain_should_be = url_parts[0] + '//' + url_parts[1].split('/', 1)[0]

        domain_to_check = Url(url_random, self.faker.word()).domain_name()

        # Message reads "<actual> should be <expected>"; the f-string also
        # keeps the message from failing if the actual value is not a str.
        assert domain_to_check == domain_should_be, (
            f"{domain_to_check} should be {domain_should_be}")
Ejemplo n.º 5
0
    def from_url(cls, url: str):
        """
        Build an RSSFeed from the feed parsed at the given url.

        The feed's title and subtitle are used when present; a missing title
        becomes "" and a missing subtitle becomes None. The feed image url is
        only reported via print and is not passed to the RSSFeed.

        :param url: address handed to feedparser.parse
        :return: RSSFeed(Url(url, title), title, description)
        """
        data = feedparser.parse(url)

        # feedparser signals a missing field with AttributeError; catching
        # only that keeps the best-effort defaults without swallowing
        # KeyboardInterrupt / SystemExit like a bare except would.
        try:
            title = data.feed.title
        except AttributeError:
            title = ""

        try:
            description = data.feed.subtitle
        except AttributeError:
            description = None

        try:
            image_url_string = data.feed.image.href
            print(f'Found image url at: {image_url_string}')
        except AttributeError:
            print('Could not find any image url.')

        feed_url = Url(url, title)

        return RSSFeed(feed_url, title, description)
Ejemplo n.º 6
0
    def test_one_domain_multiple_urls(self):
        """
        Tests that if multiple URLs are added to the database that their
        DomainName is not added to the database more than once
        """
        # Create an 'original' URL, which is saved to the Database
        url_random_obj_origin = UrlRule().url

        # Create a random number of URLs, each with the same DomainName.
        # At least one is always created: with zero extra URLs the test
        # would not exercise the "multiple URLs" case at all.
        random_num = random.randint(1, 10)
        for _ in range(random_num):
            # NOTE(review): appending a word directly assumes the original
            # url already has a path part; otherwise the domain itself would
            # change - confirm against UrlRule.
            url_random_extended = (url_random_obj_origin.as_string()
                                   + self.faker.word())
            Url(url_random_extended, self.faker.word())

        domain_for_query = url_random_obj_origin.domain_name()

        # Raise AssertionError explicitly: `assert False` is removed when
        # Python runs with -O, which would silently pass the test.
        try:
            assert DomainName.find(domain_for_query)
        except NoResultFound:
            raise AssertionError("No domains found in database")
        except MultipleResultsFound:
            raise AssertionError(
                "There were multiple DomainNames in the database")
Ejemplo n.º 7
0
    def find_or_create(cls,
                       session,
                       _url: str,
                       language=None,
                       sleep_a_bit=False):
        """

            Find the article stored for a url; if not found, download
            and extract all the required info for this article, store
            it and return it.

            The url is reduced to its canonical form first. The stored
            article text is truncated to 32000 characters.

        :param session: database session used to add and commit the article
        :param _url: url of the article; canonicalised before use
        :param language: language of the article; when falsy it is taken
            from the page's meta language, or from `detect` on the text
            when the page declares none
        :param sleep_a_bit: when true, sleep 3 to 33 seconds after the
            download
        :return: the found or newly created article; after a database
            error, the result of repeating the lookup (which may be None),
            or None if all ten recovery attempts fail
        :raises Exception: if newspaper extracted no text from the page
        """
        # Imported up front so that `time` is bound on every path: the
        # recovery loop below needs it even when sleep_a_bit is false.
        import time
        from random import randint

        from zeeguu_core.model import Url, Article, Language
        import newspaper

        url = Url.extract_canonical_url(_url)

        try:
            found = cls.find(url)
            if found:
                return found

            art = newspaper.Article(url=url)
            art.download()
            art.parse()

            if art.text == '':
                raise Exception("Newspaper got empty article from: " + url)

            if sleep_a_bit:
                print("GOT: " + url)
                sleep_time = randint(3, 33)
                print(
                    f"sleeping for {sleep_time}s... so we don't annoy our friendly servers"
                )
                time.sleep(sleep_time)

            if not language:
                if art.meta_lang == '':
                    art.meta_lang = detect(art.text)
                    zeeguu_core.log(f"langdetect: {art.meta_lang} for {url}")
                language = Language.find_or_create(art.meta_lang)

            # Create new article and save it to DB
            url_object = Url.find_or_create(session, url)

            new_article = Article(
                url_object,
                art.title,
                ', '.join(art.authors),
                art.text[
                    0:
                    32000],  # any article longer than this will be truncated...
                art.summary,
                None,
                None,
                language)
            session.add(new_article)

            session.commit()

            return new_article
        except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
            # A tuple is required here: `except A or B` evaluates to `A`
            # alone and would never catch B.
            # Presumably another writer stored the same article in the
            # meantime: roll back and look it up again, up to ten times.
            for i in range(10):
                try:
                    session.rollback()
                    u = cls.find(url)
                    print("Found article by url after recovering from race")
                    return u
                except Exception:
                    print("Exception of second degree in article..." + str(i))
                    time.sleep(0.3)

            # Every recovery attempt failed.
            return None