forked from matthiaskozubal/TVTropes_Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler_downloader.py
50 lines (41 loc) · 1.73 KB
/
crawler_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#! python3
import string
from collections import deque
from time import sleep

import crawler_functions
# Crawl script: starting from the "Tropes" page, visit every subindex page
# reachable through subindex links exactly once, collect the tropes listed on
# each visited page, and write both result lists to text files.

# CONSTANTS
CRAWL_PAUSE = 1  # value passed to sleep() after each page download (seconds)

# Main Page
page_src = crawler_functions.download_page_source("Tropes")
subindex_list = crawler_functions.get_subindexes_from_index(page_src)  # initial list of subindexes
if subindex_list is None:
    # Same failure handling as for pages inside the crawl loop below.
    print('IndexError for page: ' + 'Tropes')
    raise SystemExit(1)

# Subindexes from Main Page
trope_list = []
checked_subindex_list = []

# Work queue of pages still to visit (first in, first out).  The queue is
# consumed with popleft() instead of removing items from a list that is being
# iterated, which used to skip every other entry.
pending_subindexes = deque(dict.fromkeys(subindex_list))  # drops duplicates, keeps order
# Every subindex that has ever been queued.  Names stay in this set after they
# were visited, so a page linking back to an already checked subindex cannot
# queue it again (no repeated downloads, no endless crawl on cyclic links).
queued_subindexes = set(pending_subindexes)

try:
    while pending_subindexes:
        subindex = pending_subindexes.popleft()
        print('Current subindex page: ' + subindex)
        page_src = crawler_functions.download_page_source(subindex)
        sleep(CRAWL_PAUSE)  # pause between downloads
        #
        # Subindexes
        current_page_subindex_list = crawler_functions.get_subindexes_from_index(page_src)  # gets subindexes from current page
        if current_page_subindex_list is None:
            # None is treated as a parsing failure: report the page and abort.
            print('IndexError for page: ' + subindex)
            raise SystemExit(1)
        for current_page_subindex in current_page_subindex_list:
            if current_page_subindex not in queued_subindexes:
                queued_subindexes.add(current_page_subindex)
                pending_subindexes.append(current_page_subindex)
        checked_subindex_list.append(subindex)
        #
        # Tropes
        trope_list.extend(crawler_functions.get_tropes_from_page(page_src))  # gets tropes
        # TODO: some subindexes (e.g. Opera) are already Tropes, we need function to check if a page is already a Trope
finally:
    # Runs on normal completion and on an early abort alike, so whatever has
    # been collected so far is saved.
    #
    # Write to file list of checked subindexes
    with open('checked_subindex_list.txt', 'w') as output_checked_subindex_list:
        output_checked_subindex_list.write('\n'.join(sorted(checked_subindex_list)))
    # Write to file list of collected tropes
    with open('trope_list.txt', 'w') as output_trope_list:
        output_trope_list.write('\n'.join(sorted(trope_list)))

# visual check
# print('\n'.join(sorted(trope_list)[0:9]))
# print('\n'.join(sorted(checked_subindex_list)[0:9]))