import datetime

def main():
    # AlexaCallback, MongoCache and threaded_crawler come from the
    # surrounding project modules.
    start_time = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback)
    end_time = datetime.datetime.now()
    print((end_time - start_time).seconds)
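# AlexaCallback is not defined in these snippets; below is a minimal,
# hypothetical sketch of what it is assumed to look like: it seeds the crawl
# from Alexa's top-1M CSV and, when invoked on the downloaded zip (raw bytes
# assumed), returns up to max_urls site URLs to enqueue. The URL and limits
# are assumptions, not taken from the snippets themselves.
import csv
import io
from zipfile import ZipFile

class AlexaCallback:
    def __init__(self, max_urls=1000, **kwargs):
        # extra options such as allow_domains are accepted but ignored here
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        # only the seed download is parsed; other pages yield no links
        if url != self.seed_url:
            return []
        urls = []
        with ZipFile(io.BytesIO(html)) as zf:
            with zf.open(zf.namelist()[0]) as f:
                # rows look like "1,google.com"
                for _, website in csv.reader(io.TextIOWrapper(f)):
                    urls.append('http://' + website)
                    if len(urls) == self.max_urls:
                        break
        return urls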
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)
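# MongoCache is likewise assumed; a minimal dict-like sketch backed by
# pymongo, with a TTL index handling expiry. The collection layout and the
# expires default are assumptions, not taken from the snippets above.
from datetime import datetime, timedelta
from pymongo import MongoClient

class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        # MongoDB purges documents whose 'timestamp' is older than the TTL
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=int(expires.total_seconds()))

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()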
def main(max_threads):
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(
        seed_url='http://example.webscraping.com',
        # link_crawler(...) is assumed to return a (url, html) scrape callback
        scrape_callback=link_crawler('http://example.webscraping.com'),
        cache=cache,
        max_threads=max_threads,
        timeout=0)
def main(max_threads):
    # cache = MongoCache(expires=timedelta())
    seed_url = get_thousand_urls(100)
    # print(seed_url)
    threaded_crawler(
        seed_url,
        scrape_callback=None,
        cache=None,
        max_threads=max_threads
    )
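# get_thousand_urls is not defined anywhere in these snippets. A plausible
# sketch, assuming it pulls the first n entries from the same Alexa top-1M
# CSV used by AlexaCallback; the name and data source are assumptions.
import csv
import io
import zipfile
import requests

def get_thousand_urls(n):
    resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        with zf.open(zf.namelist()[0]) as f:
            reader = csv.reader(io.TextIOWrapper(f))
            # rows look like "1,google.com"; keep the first n domains
            return ['http://' + row[1] for row, _ in zip(reader, range(n))]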
def test(max_threads):
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlexaCallback(allow_domains=[start_url])
    cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'
    threaded_crawler(start_url, link_regex='/topsites/global;',
                     cache=cache, scrape_callback=scrape_callback,
                     max_threads=max_threads, timeout=5)
import requests
from lxml import etree

def scrape_callback(url, html):
    if url.endswith('.xml'):
        # Parse the sitemap XML file; each child's first subelement is
        # its <loc> URL
        resp = requests.get(url)
        tree = etree.fromstring(resp.content)
        links = [e[0].text for e in tree]
        return links
    else:
        # Add scraping code here
        pass

if __name__ == "__main__":
    from threaded_crawler import threaded_crawler
    sitemap = 'http://www.gap.com/products/sitemap_index.xml'
    threaded_crawler(sitemap, r'gap\.com', scrape_callback=scrape_callback)
def main():
    sitemap = 'http://www.gap.com/products/sitemap_index.xml'
    threaded_crawler(sitemap, scrape_callback=scrape_callback)
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)
# pages 0 and 2-17 (no page/1)
main_pages = ['http://date.jobbole.com/page/%d' % i
              for i in range(18) if i != 1]

if main_pages:
    threaded_crawler(main_pages, r'^http://date\.jobbole\.com/(\d+)/$',
                     max_depth=-1, max_threads=10,
                     img_callback=img_callback,
                     cache=RedisCache(),
                     user_agent='dfdfsfgdf')

import os
import shutil

path = '/home/caicai/scrapebole/chp4/data/img/date.jobbole.com'
dirpath = '/home/caicai/scrapebole/chp4/data/img'
for name in os.listdir(path):
    subdir = os.path.join(path, name)
    contents = os.listdir(subdir)
    if len(contents) == 1:
        ajpg = contents[0]
        # The original snippet breaks off after changing into the directory;
        # moving the lone image up into dirpath is an assumed completion.
        shutil.move(os.path.join(subdir, ajpg), os.path.join(dirpath, ajpg))
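# img_callback and RedisCache are referenced above but never defined.
# Below is a hypothetical img_callback with the same (url, html) signature
# as the scrape callbacks; the per-page directory layout mirrors the cleanup
# loop above, but the whole function is an assumption.
import os
from urllib.parse import urljoin
import requests
from lxml import html as lxml_html

def img_callback(url, html):
    tree = lxml_html.fromstring(html)
    # one directory per article, named after the numeric path segment
    page_dir = os.path.join('data/img/date.jobbole.com',
                            url.rstrip('/').rsplit('/', 1)[-1])
    os.makedirs(page_dir, exist_ok=True)
    for src in tree.xpath('//img/@src'):
        img_url = urljoin(url, src)
        resp = requests.get(img_url, timeout=10)
        fname = img_url.rsplit('/', 1)[-1] or 'image.jpg'
        with open(os.path.join(page_dir, fname), 'wb') as f:
            f.write(resp.content)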
def main(max_threads):
    scrape_callback = AlexaCallback()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     max_threads=max_threads, timeout=60)
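# For reference, a minimal sketch of the threaded_crawler every main() above
# drives: a shared frontier list consumed by up to max_threads worker
# threads. The variants pass differing extras (max_depth, img_callback,
# delay, ...), which **kwargs absorbs here; robots.txt handling, retries and
# politeness delays are omitted. This illustrates the threading pattern,
# not the project's actual implementation.
import re
import threading
import time
import requests

def threaded_crawler(seed_url, link_regex=None, scrape_callback=None,
                     cache=None, max_threads=10, timeout=60,
                     user_agent='wswp', **kwargs):
    crawl_queue = [seed_url] if isinstance(seed_url, str) else list(seed_url)
    seen = set(crawl_queue)

    def process_queue():
        while crawl_queue:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break  # another thread drained the queue first
            html = None
            if cache is not None:
                try:
                    html = cache[url]
                except KeyError:
                    pass
            if html is None:
                resp = requests.get(url, headers={'User-Agent': user_agent},
                                    timeout=timeout)
                # raw bytes, so callbacks like AlexaCallback can unzip them
                html = resp.content
                if cache is not None:
                    cache[url] = html
            if scrape_callback:
                for link in scrape_callback(url, html) or []:
                    if link_regex and not re.search(link_regex, link):
                        continue
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(1)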