def download_from_list(trope_list):
    """Download the page source for every trope name in *trope_list*.

    Each page is saved under ``Tropes/Main/`` with ``/`` in the trope name
    replaced by ``_`` so the title is a valid file name. A failure on one
    page is reported and the crawl continues with the next trope.
    """
    for trope in trope_list:
        try:
            print('Downloading ' + trope)
            crawler_functions.download_page_source(
                trope,
                delay=CRAWL_DELAY,
                local_file='Tropes/Main/' + trope.replace('/', '_'))
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit and hid the actual error; catch Exception and show it.
        except Exception as err:
            print('ERROR! Check page ' + trope + ' for problems')
            print(err)
def download_from_list(media_list, namespace):
    """Download the page source for every title in *media_list*.

    *namespace* is the TV Tropes namespace (e.g. 'Film', 'Literature');
    pages are saved under ``Tropes/<namespace>/`` with ``/`` in the title
    replaced by ``_``. A failure on one page is reported and the crawl
    continues with the next title.
    """
    for title in media_list:
        try:
            print('Downloading ' + title)
            crawler_functions.download_page_source(
                title,
                namespace=namespace,
                delay=CRAWL_DELAY,
                local_file='Tropes/' + namespace + '/' + title.replace('/', '_'))
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit and hid the actual error; catch Exception and show it.
        except Exception as err:
            print('ERROR! Check page ' + title + ' for problems')
            print(err)
#! python3
"""Breadth-first crawl of TV Tropes subindex pages.

Starts from the main "Tropes" index, then repeatedly downloads each queued
subindex page and enqueues any subindexes it links to that have not been
seen before.
"""
import crawler_functions
import string
from time import sleep

# CONSTANTS
CRAWL_PAUSE = 1  # seconds between downloads (politeness delay)

# Main Page
page_src = crawler_functions.download_page_source("Tropes")
subindex_list = crawler_functions.get_subindexes_from_index(page_src)  # initial list of subindexes

# Subindexes from Main Page
trope_list = []
checked_subindex_list = []  # subindex pages already processed

# BUG FIX: the original loop appended to and remove()d from subindex_list
# while iterating over it; list.remove() during iteration shifts the
# remaining elements and silently skips the next one, and removed pages
# could later be re-queued. Process the list as a FIFO work queue instead,
# recording finished pages in checked_subindex_list so no page is visited
# twice.
while subindex_list:
    subindex = subindex_list.pop(0)
    if subindex in checked_subindex_list:
        continue
    print("Current subindex page: " + subindex)
    page_src = crawler_functions.download_page_source(subindex)
    sleep(CRAWL_PAUSE)

    # Subindexes found on the current page
    current_page_subindex_list = crawler_functions.get_subindexes_from_index(
        page_src
    )
    if current_page_subindex_list is None:
        print("IndexError for page: " + subindex)
        exit(1)
    for current_page_subindex in current_page_subindex_list:
        if (current_page_subindex not in subindex_list
                and current_page_subindex not in checked_subindex_list):
            subindex_list.append(current_page_subindex)
    checked_subindex_list.append(subindex)
def find_subpages(media_doc):
    """Return every 'Name/Subpage' reference found in a media document.

    *media_doc* is a mapping with a 'name' key and, optionally, a 'source'
    key holding the downloaded page text. Returns a (possibly empty) list
    of matches of the form ``<name>/<word>``.
    """
    # BUG FIX: the title was interpolated into the pattern unescaped, so a
    # name containing regex metacharacters (e.g. 'C++') raised re.error or
    # mis-matched; re.escape makes the name match literally. Also use a raw
    # string for the \w escape.
    pattern = re.compile(re.escape(media_doc['name']) + r'/\w+')
    return pattern.findall(media_doc.get('source', ''))


if __name__ == '__main__':
    client = pymongo.MongoClient(MONGODB_HOST, MONGODB_PORT)
    db = client.get_database('tvtropes')
    media_collection = db.get_collection('media')
    trope_subpages_dir = os.path.join(os.getcwd(), 'Tropes', 'Subpages')

    for media in media_collection.find():
        subpages = find_subpages(media)
        if subpages:
            # Record which subpages this media entry references.
            media_collection.find_one_and_update(
                {'_id': media['_id']},
                {"$set": {'subpages': subpages}})
            subpage_sources = []
            for page in subpages:
                print(page)
                title, subtitle = page.split('/')
                source = download_page_source(
                    subtitle,
                    namespace=title,
                    delay=1,
                    local_file=os.path.join(trope_subpages_dir,
                                            page.replace('/', '_')))
                subpage_sources.append(source)
            # Cache the downloaded page sources alongside the document.
            media_collection.find_one_and_update(
                {'_id': media['_id']},
                {"$set": {'subpage_sources': subpage_sources}})