Example #1
import datetime


def main():
    start_time = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    # Pass the Alexa seed URL once; the original passed two positional URLs,
    # and the cache was created but never handed to the crawler.
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache)
    end_time = datetime.datetime.now()
    print((end_time - start_time).seconds)
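
Several of these examples (#1, #2, #5, #8, #10) construct an AlexaCallback that is never defined in the snippets. A minimal sketch, assuming the version from Web Scraping with Python that seeds the crawl from the zipped Alexa top-sites CSV; it expects html to be the raw response bytes, and constructor arguments such as Example #5's allow_domains are not modeled here:

import csv
import io
from zipfile import ZipFile


class AlexaCallback:
    def __init__(self, max_urls=500):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        # Only the seed URL serves the zip archive with a rank,domain CSV.
        if url == self.seed_url:
            urls = []
            with ZipFile(io.BytesIO(html)) as zf:
                csv_name = zf.namelist()[0]
                with zf.open(csv_name) as f:
                    for _, website in csv.reader(io.TextIOWrapper(f)):
                        urls.append('http://' + website)
                        if len(urls) == self.max_urls:
                            break
            return urls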
Example #2
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache,
                     max_threads=max_threads,
                     timeout=10)
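
MongoCache is likewise assumed rather than shown; the commented-out MongoCache(expires=timedelta()) in Example #4 suggests an expires parameter. A minimal sketch of a dict-like cache backed by pymongo, using a TTL index for expiry (the database and collection names are guesses):

from datetime import datetime, timedelta
from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # MongoDB purges documents once their timestamp exceeds the TTL.
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=int(expires.total_seconds()))

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()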
Example #3
def main(max_threads):
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(
        seed_url='http://example.webscraping.com',
        scrape_callback=link_crawler('http://example.webscraping.com'),
        cache=cache,
        max_threads=max_threads,
        timeout=0)
Example #4
def main(max_threads):
    # cache = MongoCache(expires=timedelta())
    seed_url = get_thousand_urls(100)
    # print(seed_url)
    threaded_crawler(
        seed_url,
        scrape_callback=None,
        cache=None,
        max_threads=max_threads
    )
Example #5
def test(max_threads):
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlexaCallback(allow_domains=[start_url])
    cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'

    threaded_crawler(start_url,
                     link_regex='/topsites/global;',
                     cache=cache,
                     scrape_callback=scrape_callback,
                     max_threads=max_threads,
                     timeout=5)
Example #6
import requests
from lxml import etree


def scrape_callback(url, html):
    if url.endswith('.xml'):
        # Parse the sitemap XML file
        resp = requests.get(url)
        tree = etree.fromstring(resp.content)
        links = [e[0].text for e in tree]
        return links
    else:
        # Add scraping code here
        pass


if __name__ == "__main__":
    from threaded_crawler import threaded_crawler
    sitemap = 'http://www.gap.com/products/sitemap_index.xml'
    threaded_crawler(sitemap, '[gap.com]*', scrape_callback=scrape_callback)
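
One note on the sitemap parsing in Example #6: e[0].text assumes <loc> is the first child of every entry. A variant that matches <loc> by name, regardless of child order or namespace, is safer:

def sitemap_links(xml_bytes):
    tree = etree.fromstring(xml_bytes)
    # Collect <loc> text from any namespace instead of assuming child order.
    return [el.text for el in tree.iter()
            if isinstance(el.tag, str) and etree.QName(el).localname == 'loc']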
Example #7
def main():
    sitemap = 'http://www.gap.com/products/sitemap_index.xml'
    threaded_crawler(sitemap, scrape_callback=scrape_callback)
Example #8
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache,
                     max_threads=max_threads,
                     timeout=10)
Example #9
main_pages = [
    'http://date.jobbole.com/page/0', 'http://date.jobbole.com/page/2',
    'http://date.jobbole.com/page/3', 'http://date.jobbole.com/page/4',
    'http://date.jobbole.com/page/5', 'http://date.jobbole.com/page/6',
    'http://date.jobbole.com/page/7', 'http://date.jobbole.com/page/8',
    'http://date.jobbole.com/page/9', 'http://date.jobbole.com/page/10',
    'http://date.jobbole.com/page/11', 'http://date.jobbole.com/page/12',
    'http://date.jobbole.com/page/13', 'http://date.jobbole.com/page/14',
    'http://date.jobbole.com/page/15', 'http://date.jobbole.com/page/16',
    'http://date.jobbole.com/page/17'
]
if main_pages:
    threaded_crawler(main_pages,
                     r'^(http://date\.jobbole\.com/)(\d+)/$',
                     max_depth=-1,
                     max_threads=10,
                     img_callback=img_callback,
                     cache=RedisCache(),
                     user_agent="dfdfsfgdf")
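
The img_callback passed above is not defined in the snippet. A hypothetical version consistent with the per-article image directories the cleanup script below flattens (the directory layout and helper names are guesses):

import os
import re
import urllib.parse

import requests


def img_callback(url, html):
    # Hypothetical: save each <img> on the page into its own directory
    # under data/img/<page host>/<image name>/.
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'ignore')
    host = urllib.parse.urlsplit(url).netloc
    for src in re.findall(r'<img[^>]+src=["\'](.*?)["\']', html):
        img_url = urllib.parse.urljoin(url, src)
        filename = os.path.basename(urllib.parse.urlsplit(img_url).path)
        if not filename:
            continue
        dirname = os.path.join('data', 'img', host,
                               os.path.splitext(filename)[0])
        os.makedirs(dirname, exist_ok=True)
        resp = requests.get(img_url, timeout=10)
        with open(os.path.join(dirname, filename), 'wb') as f:
            f.write(resp.content)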

import os
import shutil

path = '/home/caicai/scrapebole/chp4/data/img/date.jobbole.com'
dirpath = '/home/caicai/scrapebole/chp4/data/img'
# Flatten directories holding a single image: move the image up to data/img
# (the apparent purpose of the otherwise-unused shutil import and dirpath)
# and remove the emptied per-article directory.
for name in os.listdir(path):
    adir = os.path.join(path, name)
    contents = os.listdir(adir)
    if len(contents) == 1:
        shutil.move(os.path.join(adir, contents[0]),
                    os.path.join(dirpath, contents[0]))
        os.rmdir(adir)
Example #10
def main(max_threads):
    scrape_callback = AlexaCallback()
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     max_threads=max_threads,
                     timeout=60)
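
Finally, threaded_crawler itself is never shown. A minimal sketch consistent with the call sites above, accepting a seed URL or list of seeds plus the optional link_regex, scrape_callback, cache, max_threads, and timeout; extra keywords such as max_depth, img_callback, and user_agent are swallowed but ignored in this simplified version:

import re
import threading
import time
import urllib.parse

import requests


def threaded_crawler(seed_url, link_regex=None, scrape_callback=None,
                     cache=None, max_threads=10, timeout=60, **kwargs):
    # Accept a single seed URL or a list of seeds.
    crawl_queue = [seed_url] if isinstance(seed_url, str) else list(seed_url)
    seen = set(crawl_queue)
    lock = threading.Lock()

    def download(url):
        # Serve from the cache when possible, otherwise fetch over HTTP.
        if cache is not None:
            try:
                return cache[url]
            except KeyError:
                pass
        html = requests.get(url, timeout=timeout).content
        if cache is not None:
            cache[url] = html
        return html

    def process_queue():
        while True:
            with lock:
                if not crawl_queue:
                    return  # the outer loop respawns workers if links arrive
                url = crawl_queue.pop()
            try:
                html = download(url)
            except requests.RequestException:
                continue
            links = list(scrape_callback(url, html) or []) if scrape_callback else []
            if link_regex:
                text = html.decode('utf-8', 'ignore') if isinstance(html, bytes) else html
                links.extend(l for l in re.findall(r'href=["\'](.*?)["\']', text)
                             if re.search(link_regex, l))
            for link in links:
                link = urllib.parse.urljoin(url, link)
                with lock:
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)

    # Keep up to max_threads workers alive until the queue drains for good.
    threads = []
    while threads or crawl_queue:
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            t = threading.Thread(target=process_queue, daemon=True)
            t.start()
            threads.append(t)
        time.sleep(1)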