Ejemplo n.º 1
0
 def test_get_target_url_does_not_throw_if_empty_if_collection_does_not_exist(
         self):
     """Calling get_target_urls on a database with no data must not raise."""
     # given: a MongoDb wrapper whose database is replaced by a fresh
     # mongomock database that holds no collections yet
     db_wrapper = MongoDb()
     fake_client = mongomock.MongoClient()
     db_wrapper._database = fake_client["newspaper"]

     # when: the call itself is the check -- any exception fails the test
     db_wrapper.get_target_urls('de')
Ejemplo n.º 2
0
    def test_MongoDB_get_open_task_returns_none_for_empty_collection(self):
        """get_open_task yields None when nothing was ever inserted."""
        # given: a wrapper backed by a fresh, empty mongomock database
        db_wrapper = MongoDb()
        fake_client = mongomock.MongoClient()
        db_wrapper._database = fake_client["newspaper"]

        # when
        result = db_wrapper.get_open_task('tr')

        # then
        assert result is None
Ejemplo n.º 3
0
    def test_MongoDB_get_open_task_returns_none_if_no_task_open(self):
        """get_open_task yields None when the only document has a text."""
        # given: the "tr" collection holds a single document carrying a
        # "text" field
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        scraped_document = {"text": "This article is scraped"}
        db_wrapper._database["tr"].insert_one(scraped_document)

        # when
        result = db_wrapper.get_open_task("tr")

        # then
        assert result is None
Ejemplo n.º 4
0
    def test_MongoDB_get_open_task_returns_open_task(self):
        """get_open_task returns the stored document that only has a url."""
        # given: one document with just a "url" in the "de" collection
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        expected_task = {"url": "www.opentask.com"}
        insert_result = db_wrapper._database["de"].insert_one(expected_task)
        # Record the generated id so the whole document can be compared.
        expected_task['_id'] = insert_result.inserted_id

        # when
        actual_task = db_wrapper.get_open_task("de")

        # then
        assert actual_task == expected_task
Ejemplo n.º 5
0
def url_processor(language, **context):
    """Process one open task for ``language``, if there is one.

    Fetches an open task from the database, extracts data from the task's
    URL and stores it as an article. Logs and returns when no task is left.

    Args:
        language: Language key passed to ``get_open_task`` and
            ``insert_article``.
        **context: Accepted but not used by this function.
    """
    database = MongoDb()
    open_task = database.get_open_task(language)

    # Guard clause: nothing to do when the queue is empty.
    if open_task is None:
        logger.info('No task left')
        return

    task_url = open_task["url"]
    logger.info('Extracting data from {}'.format(task_url))
    article = extract_data(task_url)

    logger.info('Upserting data')
    database.insert_article(article, language=language)
Ejemplo n.º 6
0
    def test_insert_into_empty_db(self):
        """insert_article stores the article in the given collection."""
        # given: an empty mock database and an article to store
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        article = {
            "title": "How Millennials Are Disrupting Test",
            "url": "www.testytest.com",
        }

        # when
        db_wrapper.insert_article(article, "LANGUAGE")

        # then: the single stored document carries the same title
        language_collection = db_wrapper._database["LANGUAGE"]
        stored = language_collection.find_one({})
        assert stored["title"] == article["title"]
Ejemplo n.º 7
0
    def test_get_target_url(self):
        """get_target_urls returns every URL stored for the language."""
        # given: two 'de' targets in the "TARGET" collection
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        target_collection = db_wrapper._database["TARGET"]
        for target_url in ('www.news1.de', 'www.news2.de'):
            target_collection.insert_one(
                {'language': 'de', 'url': target_url})

        # when
        found_urls = db_wrapper.get_target_urls('de')

        # then: both URLs come back and nothing else
        assert len(found_urls) == 2
        assert 'www.news2.de' in found_urls
        assert 'www.news1.de' in found_urls
Ejemplo n.º 8
0
    def test_insert_does_not_create_duplicate(self):
        """Re-inserting an already stored article must not add a document.

        The article is seeded into, and counted in, the ``"LANGUAGE"``
        collection -- the collection addressed by the ``"LANGUAGE"``
        argument passed to ``insert_article``.
        """
        # given
        mongo_database = MongoDb()
        mongo_database._database = mongomock.MongoClient()["newspaper"]
        # ``_database`` already is the "newspaper" database. Indexing it with
        # "newspaper" again would address a separate collection named
        # "newspaper.LANGUAGE", so the assertion below would always see
        # exactly the seeded document and could never detect a duplicate.
        language_collection = mongo_database._database["LANGUAGE"]
        existing_article = {
            "title": "How Millennials Are Disrupting Test",
            "url": "www.testytest.com"
        }
        existing_article["_id"] = language_collection.insert_one(
            existing_article).inserted_id

        # when
        mongo_database.insert_article(existing_article, "LANGUAGE")

        # then
        article_count = language_collection.count_documents({})
        assert article_count == 1
def url_scraper(language, **context):
    """Create tasks for the articles found at each target URL.

    For each URL returned by ``get_target_urls(language)``, builds a
    ``newspaper`` source, cleans the article URLs it lists and inserts one
    task per cleaned URL.

    Args:
        language: Language key passed to ``get_target_urls``,
            ``newspaper.build`` and ``insert_tasks``.
        **context: Accepted but not used by this function.
    """
    database = MongoDb()

    # Same options for every source; collected once for readability.
    build_options = {
        'language': language,
        'memoize_articles': False,
        'fetch_images': False,
        'MIN_WORD_COUNT': 100,
    }

    for url in database.get_target_urls(language):
        logger.info('Generating TODOs for {}'.format(url))
        source = newspaper.build(url, **build_options)

        logger.info('Creating tasks for {}'.format(url))
        article_urls = get_clean_urls(
            [article.url for article in source.articles])
        # Each task remembers which target URL it was discovered from.
        pending_tasks = [
            {'url': article_url, 'origin': url}
            for article_url in article_urls
        ]

        logger.info('Inserting tasks for {}'.format(url))
        database.insert_tasks(pending_tasks, language)
Ejemplo n.º 10
0
    def test_insert_data_overwrites_task_entry(self):
        """insert_article replaces the url-only entry for the same URL."""
        # given: a document with only a url already exists in "tr"
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        db_wrapper._database["tr"].insert_one({'url': 'www.bike.com'})

        # when: an article with the same url is inserted
        scraped_article = {
            'url': 'www.bike.com',
            'text': 'This article is scraped',
        }
        db_wrapper.insert_article(scraped_article, "tr")

        # then: exactly one document for that url remains, now with text
        cursor = db_wrapper._database["tr"].find({'url': 'www.bike.com'})
        matches = list(cursor)

        assert matches[0]["text"] == "This article is scraped"
        assert len(matches) == 1
Ejemplo n.º 11
0
    def test_insert_tasks_does_not_reinsert_solved_task(self):
        """insert_tasks leaves a document that already has text untouched."""
        # given: a document with url and text already exists in "tr"
        db_wrapper = MongoDb()
        db_wrapper._database = mongomock.MongoClient()["newspaper"]
        solved_document = {
            'url': 'www.bike.com',
            'text': "this article is scraped",
        }
        db_wrapper._database["tr"].insert_one(solved_document)

        # when: a task for the same url is submitted again
        db_wrapper.insert_tasks([{'url': 'www.bike.com'}], "tr")

        # then: still one document for that url, with its text intact
        cursor = db_wrapper._database["tr"].find({'url': 'www.bike.com'})
        matches = list(cursor)

        assert matches[0]["text"] == "this article is scraped"
        assert len(matches) == 1