def download_from_list(trope_list):
    """Download the page source for every trope name in *trope_list*.

    Each page is saved under ``Tropes/Main/`` with ``/`` in the trope name
    replaced by ``_`` so the title is a valid file name. A failure on one
    page is reported and the crawl continues with the next trope.
    """
    for trope in trope_list:
        try:
            print('Downloading ' + trope)
            crawler_functions.download_page_source(
                trope,
                delay=CRAWL_DELAY,
                local_file='Tropes/Main/' + trope.replace('/', '_'))
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit and hid the actual error; catch Exception and show it.
        except Exception as err:
            print('ERROR! Check page ' + trope + ' for problems')
            print(err)
def download_from_list(media_list, namespace):
    """Download the page source for every title in *media_list*.

    *namespace* is the TV Tropes namespace (e.g. 'Film', 'Literature');
    pages are saved under ``Tropes/<namespace>/`` with ``/`` in the title
    replaced by ``_``. A failure on one page is reported and the crawl
    continues with the next title.
    """
    for title in media_list:
        try:
            print('Downloading ' + title)
            crawler_functions.download_page_source(
                title,
                namespace=namespace,
                delay=CRAWL_DELAY,
                local_file='Tropes/' + namespace + '/' + title.replace('/', '_'))
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit and hid the actual error; catch Exception and show it.
        except Exception as err:
            print('ERROR! Check page ' + title + ' for problems')
            print(err)
#! python3
"""Breadth-first crawl of TV Tropes subindex pages.

Starts from the main "Tropes" index, then repeatedly downloads each queued
subindex page and enqueues any subindexes it links to that have not been
seen before.
"""
import crawler_functions
import string
from time import sleep

# CONSTANTS
CRAWL_PAUSE = 1  # seconds between downloads (politeness delay)

# Main Page
page_src = crawler_functions.download_page_source("Tropes")
subindex_list = crawler_functions.get_subindexes_from_index(page_src)  # initial list of subindexes

# Subindexes from Main Page
trope_list = []
checked_subindex_list = []  # subindex pages already processed

# BUG FIX: the original loop appended to and remove()d from subindex_list
# while iterating over it; list.remove() during iteration shifts the
# remaining elements and silently skips the next one, and removed pages
# could later be re-queued. Process the list as a FIFO work queue instead,
# recording finished pages in checked_subindex_list so no page is visited
# twice.
while subindex_list:
    subindex = subindex_list.pop(0)
    if subindex in checked_subindex_list:
        continue
    print("Current subindex page: " + subindex)
    page_src = crawler_functions.download_page_source(subindex)
    sleep(CRAWL_PAUSE)

    # Subindexes found on the current page
    current_page_subindex_list = crawler_functions.get_subindexes_from_index(
        page_src
    )
    if current_page_subindex_list is None:
        print("IndexError for page: " + subindex)
        exit(1)
    for current_page_subindex in current_page_subindex_list:
        if (current_page_subindex not in subindex_list
                and current_page_subindex not in checked_subindex_list):
            subindex_list.append(current_page_subindex)
    checked_subindex_list.append(subindex)
def find_subpages(media_doc):
    """Return every 'Name/Subpage' reference found in a media document.

    *media_doc* is a mapping with a 'name' key and, optionally, a 'source'
    key holding the downloaded page text. Returns a (possibly empty) list
    of matches of the form ``<name>/<word>``.
    """
    # BUG FIX: the title was interpolated into the pattern unescaped, so a
    # name containing regex metacharacters (e.g. 'C++') raised re.error or
    # mis-matched; re.escape makes the name match literally. Also use a raw
    # string for the \w escape.
    pattern = re.compile(re.escape(media_doc['name']) + r'/\w+')
    return pattern.findall(media_doc.get('source', ''))


if __name__ == '__main__':
    client = pymongo.MongoClient(MONGODB_HOST, MONGODB_PORT)
    db = client.get_database('tvtropes')
    media_collection = db.get_collection('media')
    trope_subpages_dir = os.path.join(os.getcwd(), 'Tropes', 'Subpages')

    for media in media_collection.find():
        subpages = find_subpages(media)
        if subpages:
            # Record which subpages this media entry references.
            media_collection.find_one_and_update(
                {'_id': media['_id']},
                {"$set": {'subpages': subpages}})
            subpage_sources = []
            for page in subpages:
                print(page)
                title, subtitle = page.split('/')
                source = download_page_source(
                    subtitle,
                    namespace=title,
                    delay=1,
                    local_file=os.path.join(trope_subpages_dir,
                                            page.replace('/', '_')))
                subpage_sources.append(source)
            # Cache the downloaded page sources alongside the document.
            media_collection.find_one_and_update(
                {'_id': media['_id']},
                {"$set": {'subpage_sources': subpage_sources}})