from crawler.crawler import Crawler


def main():
    nflcrawler = Crawler()
    # Seed the crawl with a handful of NFL team roster pages.
    seeds = [
        "http://www.nfl.com/teams/roster?team=STL",
        "http://www.nfl.com/teams/roster?team=TEN",
        "http://www.nfl.com/teams/roster?team=WAS",
        "http://www.nfl.com/teams/roster?team=CAR",
        "http://www.nfl.com/teams/roster?team=CLE",
        "http://www.nfl.com/teams/roster?team=JAC",
        "http://www.nfl.com/teams/roster?team=KC",
    ]
    nflcrawler.add_seeds(seeds)
    # From a roster page follow player profile links; from a profile page
    # follow the player's career-stats page.
    rules = {
        "^(http://www.nfl.com/teams/roster)(\?team=[a-zA-Z]+)$": [
            "^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$"
        ],
        "^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$": [
            "^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/careerstats)$"
        ],
    }
    nflcrawler.add_rules(rules)
    nflcrawler.start()
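# A quick sanity check of the profile rule above using only the standard
# library; the player path in sample_url is made up for illustration and is
# not a real nfl.com URL.
import re

profile_rule = r"^(http://www.nfl\.com/player/)([a-zA-Z]+/[0-9]+/profile)$"
sample_url = "http://www.nfl.com/player/somename/1234567/profile"
assert re.match(profile_rule, sample_url) is not None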
from crawler.crawler import Crawler


def main():
    nfltweetcrawler = Crawler()
    # Seed with the tweeting-athletes NFL category listing.
    seeds = ['http://www.tweeting-athletes.com/index.cfm?CatID=2&People=1']
    nfltweetcrawler.add_seeds(seeds)
    # Listing page -> athlete pages -> paginated tweet pages, which link on to
    # further pages of the same form.
    rules = {
        '^(http://www.tweeting-athletes.com/)(index.cfm\?CatID=2&People=1)$': [
            '^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$'
        ],
        '^(http://www.tweeting-athletes.com/)(index.cfm\?AthleteID=[0-9]+)$': [
            '^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ],
        '^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$': [
            '^(http://www.tweeting-athletes.com/index.cfm)(\?CatID=0&AthleteID=[0-9]+&p=[0-9]+)$'
        ],
    }
    nfltweetcrawler.add_rules(rules)
    nfltweetcrawler.start()
from django.core.exceptions import ObjectDoesNotExist
from django.shortcuts import redirect


# Source, Crawler and runingcrawlers are defined elsewhere in this app.
def startCrawler(request):
    try:
        id = request.POST.get('id')
        source = Source.objects.get(id=id)
        sourceurl = source.url
        crawler = Crawler(sourceurl)
        crawler.start()
        # Keep a handle on the running crawler so it can be managed later.
        runingcrawlers.update({'id': id, 'inst': crawler})
        return redirect('dashboard')
    except ObjectDoesNotExist:
        return redirect('dashboard')
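# A minimal urls.py sketch for exposing the view above. The module path
# ("crawlerapp.views"), the URL paths and the dashboard view are assumptions
# for illustration; the snippet itself only shows that a route named
# 'dashboard' must exist for the redirects to resolve.
from django.urls import path

from crawlerapp import views

urlpatterns = [
    path('crawler/start/', views.startCrawler, name='startcrawler'),
    path('dashboard/', views.dashboard, name='dashboard'),
]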
from crawler.crawler import Crawler

mycrawler = Crawler()
seeds = ['http://www.baidu.com/']  # list of seed urls
mycrawler.add_seeds(seeds)
# Your crawling rules: a dictionary whose keys are regular expressions for
# urls, and whose values are lists of regular expressions for the urls you
# want to follow from a page matching the key.
rules = {'^(http://.+baidu\.com)(.+)$': ['^(http://.+baidu\.com)(.+)$']}
mycrawler.add_rules(rules)
mycrawler.start()  # start crawling
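# A slightly richer rules dictionary for the same site, only to illustrate the
# key/value relationship described above: from the homepage follow /news/
# pages, and from /news/ pages follow further /news/ pages. The path patterns
# are hypothetical and are not added to the crawler above.
news_rules = {
    '^(http://www\.baidu\.com)(/)$': ['^(http://.+baidu\.com)(/news/.+)$'],
    '^(http://.+baidu\.com)(/news/.+)$': ['^(http://.+baidu\.com)(/news/.+)$'],
}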
if __name__ == "__main__": try: import http.client as httplib except ImportError: import httplib # Override the 100 header limit on responses # Otherwise our requests to the washington post will fail. httplib._MAXHEADERS = 1000 starting_urls = [ 'http://thehill.com/', 'http://www.newsweek.com/', 'https://www.washingtonpost.com/', 'https://www.wsj.com/', 'http://thefederalist.com/', 'http://www.cnn.com/', 'http://foxnews.com/' ] urls = [] for s_url in starting_urls: agg_urls = crawl_sitemaps(s_url, max_depth=1) urls.extend(agg_urls) router = PageRouter() router.add_route('.*', save_page) c = Crawler(router, url_stack=[u['location'] for u in urls]) c.max_depth = 1 c.start()
from crawler.crawler import Crawler

mycrawler = Crawler()
seeds = ['http://www.fdprice.com/']  # list of seed urls
mycrawler.add_seeds(seeds)
# Your crawling rules: a dictionary whose keys are regular expressions for
# urls, and whose values are lists of regular expressions for the urls you
# want to follow from a page matching the key.
rules = {'^(http://.+fdprice\.com)(.+)$': ['^(http://.+fdprice\.com)(.+)$']}
mycrawler.add_rules(rules)
mycrawler.start()  # start crawling
def start_crawler_post_save(sender, instance, created, **kwargs):
    # Only crawl when the record is first created; this also keeps the
    # instance.save() below from re-triggering this post_save handler.
    if not created:
        return
    crawler = Crawler(instance.seed_url)
    instance.result = crawler.start(instance.depth)
    instance.status = "COMPLETED"
    instance.save()
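# Wiring the handler to a model: a minimal sketch using Django's post_save
# signal. The original snippet does not name the sending model, so "CrawlJob"
# and its module path are assumptions.
from django.db.models.signals import post_save

from myapp.models import CrawlJob  # assumed app and model name

post_save.connect(start_crawler_post_save, sender=CrawlJob)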
from crawler.crawler import Crawler
import os
import json

# Read the crawl target, output location and tag whitelist from the environment.
url = os.getenv('CRAWLER_TARGET_URL')
output_path = os.getenv('CRAWLER_OUTPUT_PATH')
tags = json.loads(os.getenv('CRAWLER_TARGET_TAGS', '["a", "img", "script"]'))

if not url:
    raise NameError('CRAWLER_TARGET_URL env var not set')
if not output_path:
    raise NameError('CRAWLER_OUTPUT_PATH env var not set')

crawl = Crawler(url, output_path, tags)
crawl.start()
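# Example of driving the script above locally. The file name "run_crawler.py",
# the target URL and the output path are placeholders, not part of the
# original snippet.
import os
import subprocess

env = dict(os.environ,
           CRAWLER_TARGET_URL='http://www.example.com/',
           CRAWLER_OUTPUT_PATH='/tmp/crawl-output',
           CRAWLER_TARGET_TAGS='["a", "img"]')
subprocess.run(['python', 'run_crawler.py'], check=True, env=env)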