def __init__(self, frame, root_url, thread_num, crawl_depth, login_session):
    self.frame = frame
    self.root_url = root_url
    # Thread bookkeeping lives on the class so every worker sees the same counters.
    SpiderMain.thread_num = thread_num
    SpiderMain.wait_thread_num = thread_num
    SpiderMain.stoped = False
    self.con = threading.Condition()
    self.url_queue = UrlQueue(root_url)
    self.outputer = html_outputer.HtmlOutputer()
    self.crawl_depth = crawl_depth
    self.login_session = login_session
    self.threads = []
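
# A minimal sketch (not the original code) of how the shared Condition and the
# class-level wait_thread_num / stoped counters set up above are typically used:
# a worker marks itself busy while it crawls, and when the queue is empty while
# every worker is idle, the crawl is declared finished. The names worker_loop,
# has_new_url, get_new_url and crawl_one are assumptions for illustration only.
def worker_loop(self):
    while not SpiderMain.stoped:
        with self.con:
            while not self.url_queue.has_new_url():
                if SpiderMain.wait_thread_num == SpiderMain.thread_num:
                    # Queue is empty and every worker is idle: nothing left to do.
                    SpiderMain.stoped = True
                    self.con.notify_all()
                    return
                self.con.wait()
                if SpiderMain.stoped:
                    return
            # Take a URL and mark this worker as busy while it crawls.
            SpiderMain.wait_thread_num -= 1
            url = self.url_queue.get_new_url()
        self.crawl_one(url)  # hypothetical fetch/parse step that may enqueue new URLs
        with self.con:
            SpiderMain.wait_thread_num += 1
            self.con.notify_all()
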
        # (tail of a parsing loop; the surrounding function is not shown in this snippet)
        json_key = int(value[0].get('index'))
        result[key] = json_response[json_key]
        result.update({'status': 'A'})
        Videos.objects.filter(id=url.id_indb).update(**result)


if __name__ == '__main__':
    import os
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "spiderx.settings")
    import time
    from crawler_models import Url
    import django
    django.setup()

    queue = UrlQueue()
    # queue.put(Url(url='http://www.youku.com', url_tip='youku', url_type='host'))  # 235
    # queue.put(Url(url='http://www.acfun.tv', url_tip='acfun', url_type='host'))  # 129
    # queue.put(Url(url='http://tv.sohu.com', url_tip='sohutv', url_type='host'))  # 299
    # queue.put(Url(url='http://www.bilibili.com', url_tip='bilibili', url_type='host'))
    # queue.put(
    #     Url(url='http://v.youku.com/v_show/id_XODk0NjM0Nzcy.html', url_tip='youku', url_type='video',
    #         id_indb=257))
    queue.put(Url(url='http://tv.sohu.com/20150306/n409440063.shtml', url_tip='sohutv', url_type='video',
                  id_indb=7200))
    # for i in range(10):
    job = Job(0)
    # queue.put(Url(url='http://tv.sohu.com/20150221/n409106225.shtml', url_tip='sohutv', url_type='video'))
    start = time.time()
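
# A minimal sketch of what the Url record queued above could look like, inferred
# from the constructor calls in this snippet; the real class lives in
# crawler_models (not shown here) and may carry more fields.
class Url(object):
    def __init__(self, url, url_tip, url_type, id_indb=None):
        self.url = url            # page to fetch
        self.url_tip = url_tip    # site tag, e.g. 'youku', 'sohutv'
        self.url_type = url_type  # 'host' for site roots, 'video' for detail pages
        self.id_indb = id_indb    # primary key of the matching Videos row, if any
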
import sys
import traceback
# yellow() and green() are console-color helpers imported elsewhere in this file.


class Crawler:
    def __init__(self, seeds):
        self.urlQueue = UrlQueue()
        # Accept either a single seed URL or a list of them.
        if isinstance(seeds, str):
            self.urlQueue.addUnvisitedUrl(seeds)
        elif isinstance(seeds, list):
            for i in seeds:
                self.urlQueue.addUnvisitedUrl(i)
        print yellow("Add seeds {unvisitedUrl} site.".format(
            unvisitedUrl=str(self.urlQueue.unVisited).lstrip('[').rstrip(']')))

    def process(self):
        # Visit one URL: fetch its links, mark it visited, enqueue the new links.
        visitUrl = self.urlQueue.unVisitedUrlDeQuence()
        print yellow("\nProcessing Url: {url}".format(url=visitUrl))
        print "Get Hyper Links:"
        links = self.getHyperLinks(visitUrl)
        print yellow("\nGet {linkno} new links".format(linkno=len(links)))
        print yellow("\nAdding url: {url} to VisitedQueue".format(url=visitUrl))
        self.urlQueue.addVisitedUrl(visitUrl)
        print yellow("\nVisited url count: {count}".format(
            count=str(self.urlQueue.getVisitedUrlCount())))
        for link in links:
            print "\nAdding new link {link_name} to UnvisitedUrlQueue."\
                .format(link_name=green(link))
            self.urlQueue.addUnvisitedUrl(link)
        print "\nHave %d unvisited links" % len(self.urlQueue.getUnvisitedUrl())

    def crawling(self, crawl_count):
        # crawl_count == 0 means "crawl until the unvisited queue is empty";
        # a positive value caps the number of pages visited.
        if crawl_count == 0:
            flag = False
        elif crawl_count > 0:
            flag = True
        else:
            raise ValueError("Fatal error: crawl_count->{num} is invalid.".format(num=str(crawl_count)))
        try:
            if flag:
                while not self.urlQueue.unVisitedUrlsEmpty() and \
                        self.urlQueue.getVisitedUrlCount() <= crawl_count - 1:
                    self.process()
            else:
                while not self.urlQueue.unVisitedUrlsEmpty():
                    self.process()
        except Exception, e:
            print "process error:", e
            traceback.print_exc()
            sys.exit(1)
        # print self.urlQueue.getUnvisitedUrl()
        print self.urlQueue.getVisitedUrlCount()
        print self.urlQueue.getVisitedUrl()
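
# A minimal sketch (not the project's real implementation) of a UrlQueue that
# satisfies the interface Crawler uses above: two lists with de-duplication on
# insert. Method and attribute names follow the calls visible in the snippet.
class UrlQueue(object):
    def __init__(self):
        self.visited = []
        self.unVisited = []

    def addUnvisitedUrl(self, url):
        # Skip empty strings and anything already visited or already queued.
        if url and url not in self.visited and url not in self.unVisited:
            self.unVisited.append(url)

    def addVisitedUrl(self, url):
        self.visited.append(url)

    def unVisitedUrlDeQuence(self):
        return self.unVisited.pop(0)

    def unVisitedUrlsEmpty(self):
        return len(self.unVisited) == 0

    def getVisitedUrlCount(self):
        return len(self.visited)

    def getUnvisitedUrl(self):
        return self.unVisited

    def getVisitedUrl(self):
        return self.visited

# Example use, assuming Crawler also defines getHyperLinks() (called in
# process() above but not shown in this snippet):
#     crawler = Crawler(["http://example.com"])
#     crawler.crawling(10)   # visit at most 10 pages
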