Example #1
    def setUp(self) -> None:
        self.clear_env()

        # Reset all module-level mocks so state from earlier tests does not leak in
        mock_google_photos.reset_mock()
        mock_google_photos.upload_media.reset_mock(side_effect=True)
        mock_twitter.reset_mock(side_effect=True)
        mock_twitter.make_original_image_url.reset_mock(side_effect=True)
        mock_store.reset_mock()
        mock_store.fetch_not_added_tweet_ids.reset_mock(return_value=True)
        mock_store.fetch_all_failed_upload_medias.reset_mock(return_value=True)
        mock_store.insert_tweet_info.reset_mock(side_effect=True)
        mock_store.insert_failed_upload_media.reset_mock(side_effect=True)
        mock_request.reset_mock(side_effect=True)
        mock_request.urlretrieve.reset_mock(side_effect=True)
        mock_makedirs.reset_mock()
        mock_rmtree.reset_mock()
        mock_sleep.reset_mock(side_effect=True)
        mock_crawler_func.reset_mock(side_effect=True, return_value=True)
        mock_crawler_func2.reset_mock(side_effect=True, return_value=True)

        # The patched constructors return the mock instances themselves
        mock_google_photos.return_value = mock_google_photos
        mock_twitter.return_value = mock_twitter
        mock_store.return_value = mock_store

        os.environ['SAVE_MODE'] = 'google'
        self.crawler = Crawler()
Example #2
    def test_crawler_urlsLevelHost_method_returns_correct_result(self):
        c = Crawler("http://www.elpais.es")
        c.urlsLevelHost(1)
        uno = len(c.urls)
        c.urlsLevelHost(2)
        dos = len(c.urls)
        self.assertTrue(dos > 1)
Example #3
async def run(loop):

    manager = PersistManager(use_index=True)  # enable 'use_index' to use Elasticsearch (Part 3)
    crawler = Crawler(loop=loop, manager=manager)

    await crawler.get_history()  # Retrieve 5 minute history (Part 1)
    await crawler.run_updates()  # Constant updates (Part 2)
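
A minimal driver for this coroutine, assumed for illustration since the snippet does not show how run() is scheduled:

import asyncio

# Run the crawler workflow on an event loop until both steps complete.
loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))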
Example #4
    def test_crawler_downloadOneUrlNewspaperThread_method_returns_correct_result(
            self):
        c = Crawler(
            "https://politica.elpais.com/politica/2017/08/29/actualidad/1504006030_167758.html"
        )
        c.downloadOneUrlThread("alienigenaviolanenes.html")
        self.assertTrue(os.path.exists("alienigenaviolanenes.html"))
        self.assertEqual(len(c.files), 1)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("--use-web-api", action='store_true')
    args = parser.parse_args()

    if not args.use_web_api:
        # Crawl locally with the asyncio-based crawler
        from app.crawler import Crawler
        import asyncio
        import logging

        loop = asyncio.get_event_loop()
        c = Crawler(logging_level=logging.INFO)
        found_domains = loop.run_until_complete(c.crawl(args.url))
        print(c._domains)

    else:
        # Delegate the domain count to the web API listening on localhost
        import requests

        found_domains = requests.post("http://localhost/count_domains",
                                      json={"urls": [args.url]})
        found_domains = list(found_domains.json().values())[0]

    print(f"found {found_domains} domains")
Example #6
    def test_crawler_downloadInit_method_returns_correct_result(self):
        c = Crawler("http://www.gnu.org")
        self.assertEqual(c.url, "http://www.gnu.org")
        self.assertEqual(
            c.title, "The GNU Operating System and the Free Software Movement")
Example #7
    def test_crawler_urlsLevel1Host_method_returns_correct_result(self):
        c = Crawler("http://www.elpais.es")
        c.urlsLevel1Host()
        self.assertTrue(len(c.urls) > 1)
Example #8
    def test_crawler_downloadOneUrlThread_method_returns_correct_result(self):
        c = Crawler("http://www.elpais.es")
        c.downloadOneUrlThread("elpais.html")
        self.assertTrue(os.path.exists("elpais.html"))
Example #9
    def test_crawler_downloadOneUrl_method_returns_correct_result(self):
        c = Crawler("http://www.urjc.es")
        c.downloadOneUrl("urjc.html")
        self.assertTrue(os.path.exists("urjc.html"))
Example #10
from flask import Flask
from flask_apscheduler import APScheduler
from config import Config
from app.models import Database
from app.crawler import Crawler

# Module-level singletons shared by the app factory and the API blueprint
scheduler = APScheduler()
config = Config()
db = Database()
crawler = Crawler()


def create_app():
    app = Flask(__name__)
    app.config.from_object(config)
    scheduler.init_app(app)
    scheduler.start()

    from app.api import api as api_blueprint
    app.register_blueprint(api_blueprint)

    return app
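
The factory above starts the scheduler but registers no jobs in this snippet; a hypothetical sketch of wiring in the module-level crawler, assuming it exposes a run() method (as in Example #12) and picking an arbitrary interval:

# Hypothetical job registration; the id, interval and crawler.run() are assumptions.
@scheduler.task('interval', id='run_crawler', minutes=10)
def run_crawler_job():
    crawler.run()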
Example #11
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
Example #12
from app.crawler import Crawler
from app.db import DB

db = DB()
if __name__ == '__main__':
    crawler = Crawler(db)
    crawler.run()