-
Notifications
You must be signed in to change notification settings - Fork 0
/
coronaDataFetcher.py
144 lines (105 loc) · 3.63 KB
/
coronaDataFetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from Utilities.Logger import print_flush, reset_log_info
from Utilities.coronaDB import MyCoronaDB
from seleniums.CNNSelenium import CNNSeleniumScraper
from seleniums.arcgisDashboardSelenium import ArcgisDashSeleniumScraper
from seleniums.calcalistSelenium import CalcalistSeleniumScraper
from seleniums.haaretzSelenium import HaaretzSeleniumScraper
from seleniums.ynetSelenium import YnetSeleniumScraper
from spiders.AlJazeeraSpider import AlJazeeraSpider
from spiders.BbcSpider import BbcSpider
from spiders.ClalitSpider import ClalitSpider
from spiders.MakoSpider import MakoSpider
from spiders.StatistaSpider import StatistaSpider
from spiders.WallaSpider import WallaSpider
from spiders.WorldmetersSpider import WorldometersSpider
from spiders.WikipediaSpider import WikipediaSpider
from spiders.ynetSpider import YnetSpider
from twisted.internet.task import LoopingCall
from twisted.internet import reactor
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from multiprocessing import Process
import datetime
from time import sleep
def crawl_selenium_scrapers():
    """Run every enabled Selenium-based scraper once, in a fixed order."""
    scraper_classes = (
        ArcgisDashSeleniumScraper,
        CalcalistSeleniumScraper,
        CNNSeleniumScraper,
        HaaretzSeleniumScraper,
        # YnetSeleniumScraper is intentionally disabled.
    )
    for scraper_class in scraper_classes:
        scraper_class().scrape()
def run_seleniums_once():
    """Do one full pass of all Selenium scrapers, logging start/end timestamps."""
    started = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    print_flush(f"Let's start scraping SELENIUM! {started}")
    crawl_selenium_scrapers()
    # NOTE: relies on the module-level `db` handle created at script startup.
    db.print_db()
    finished = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    print_flush(f"FINISHED scraping SELENIUM! {finished}")
def run_seleniums_in_loops():
    """Run the Selenium scrapers forever, pausing 10 minutes between passes."""
    while True:
        run_seleniums_once()
        sleep(60 * 10)  # 10-minute pause before the next pass
def crawl_spiders(runner):
    """Schedule every Scrapy spider on ``runner``, logging start/end timestamps.

    ``runner.crawl`` only schedules the crawls; they run under the Twisted
    reactor, so the "FINISHED" log marks scheduling, not crawl completion.
    """
    started = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    print_flush(f"Let's start scraping SPIDERS! {started}")
    spider_classes = (
        AlJazeeraSpider,
        BbcSpider,
        ClalitSpider,
        MakoSpider,
        StatistaSpider,
        WallaSpider,
        WikipediaSpider,
        WorldometersSpider,
        YnetSpider,
    )
    for spider_class in spider_classes:
        runner.crawl(spider_class)
    # NOTE: relies on the module-level `db` handle created at script startup.
    db.print_db()
    finished = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    print_flush(f"FINISHED scraping SPIDERS! {finished}")
def crawl_spiders_once():
    """Run all spiders a single time, blocking until the reactor stops."""
    configure_logging()
    crawler = CrawlerRunner()
    crawl_spiders(crawler)
    # Once every scheduled crawl has finished (or failed), stop the reactor
    # so this function returns.
    finished = crawler.join()
    finished.addBoth(lambda _: reactor.stop())
    reactor.run()
def one_time_scrape():
    """Do a single full pass: all Scrapy spiders, then all Selenium scrapers."""
    for step in (crawl_spiders_once, run_seleniums_once):
        step()
def scrape_all_for_loop(runner):
    """One combined pass (spiders then seleniums), for use inside a LoopingCall."""
    crawl_spiders(runner)
    run_seleniums_once()
def crawl_spiders_in_loop():
    """Run the spider pass every 10 minutes under the Twisted reactor (blocks)."""
    configure_logging()
    crawler = CrawlerRunner()
    repeating = LoopingCall(lambda: crawl_spiders(crawler))
    repeating.start(60 * 10)  # fires immediately, then every 10 minutes
    reactor.run()
def loop_scraper():
    """Run the Selenium loop and the spider loop in two parallel processes.

    NOTE(review): the ``__name__`` guard sits *inside* the function —
    presumably so a multiprocessing child re-importing this module cannot
    spawn processes again; confirm before moving it. When the module is
    imported (not run as a script) this function is a no-op.
    """
    if __name__ == '__main__':
        workers = [
            Process(target=run_seleniums_in_loops),
            Process(target=crawl_spiders_in_loop),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
def scrape_loop_together():
    """Run the combined spiders+seleniums pass every 20 minutes (blocks forever)."""
    configure_logging()
    crawler = CrawlerRunner()
    repeating = LoopingCall(lambda: scrape_all_for_loop(crawler))
    repeating.start(60 * 20)  # fires immediately, then every 20 minutes
    reactor.run()
# --- Script entry point -------------------------------------------------
print_flush("Welcome to the Corona Data Fetcher for Israel's sick counter")
# db.print_db()
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
print_flush(f"Let's start scraping! {current_time}")
# Module-level DB handle: the scraping functions above read this global,
# so it must be created before any scraping starts.
db = MyCoronaDB()
reset_log_info()
# Blocks inside reactor.run(); the cleanup below only runs if the reactor
# is ever stopped. Alternative entry points are kept commented out.
scrape_loop_together()
# loop_scraper()
# one_time_scrape()
db.print_db()
db.close()