class SpiderMan(object):
    """Coordinate downloader, parser and output to scrape Mtime movie ratings."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl rating/box-office data for every movie linked from root_url.

        :param root_url: entry page whose links are extracted by the parser;
                         each extracted item is a pair (movie page URL, movie id)
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                # Throttle requests so we do not hammer the remote service.
                time.sleep(0.1)
                t = time.strftime("%Y%m%d%H%M%S", time.localtime())
                # Rating-API endpoint: url[0] is the movie page URL,
                # url[1] is the movie id.
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Services'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                # Python 3 exception syntax (the original Py2 form is a syntax
                # error alongside this file's Py3 blocks); report the cause
                # instead of swallowing it.
                print('Crawl failed: %s' % e)
        self.output.output_end()
        print("Crawl finish")
class Main(object):
    """Driver that downloads a listing page and fetches each movie's rating JSON."""

    def __init__(self):
        self.download = HtmlDownloader()
        self.par = HtmlParser()

    def action(self, root_url):
        """Download root_url, extract movie links, and fetch each rating payload.

        :param root_url: listing page to start from; the parser yields
                         (movie page URL, movie id) pairs from it
        """
        html = self.download.download(root_url)
        # Extract the URLs of the current page via the parser.
        urls = self.par.parser_url(html)
        # Walk the URLs and build the rating-API request for each.
        for url in urls:
            # NOTE(review): the trailing '2877' appears to be a fixed suffix the
            # API expects after the timestamp — confirm against the service.
            t = time.strftime("%Y%m%d%H%M%S2877", time.localtime())
            new_url = ("http://service.library.mtime.com/Movie.api"
                       "?Ajax_CallBack=true"
                       "&Ajax_CallBackType=Mtime.Library.Services"
                       "&Ajax_CallBackMethod=GetMovieOverviewRating"
                       "&Ajax_CrossDomain=1"
                       "&Ajax_RequestUrl=%s"
                       "&t=%s"
                       "&Ajax_CallBackArgument0=%s" % (url[0], t, url[1]))
            # Python 3 print() calls — the original Py2 print statements are
            # syntax errors alongside this file's Py3 blocks.
            print(new_url)
            # Hand the new URL to the downloader.
            detail_html = self.download.download(new_url)
            print(detail_html)
            self.par.parser_json(detail_html)
class SpiderMan(object):
    """Spider that walks a bounded frontier of pages through a URL manager."""

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Seed the frontier from root_url, then crawl at most 10 pages.

        :param root_url: seed page whose JSON response yields the initial URLs
        """
        seed_text = self.downloader.download(root_url)
        seed_urls = self.parser.parser_json(root_url, seed_text)
        self.manager.add_new_urls(seed_urls)
        # Keep going until the manager runs dry or 10 URLs have been consumed.
        while self.manager.has_new_url() and self.manager.old_url_size() < 10:
            try:
                page_url = self.manager.get_new_url()
                page_html = self.downloader.download(page_url)
                record = self.parser.parser(page_url, page_html)
                self.output.store_data(record)
            except Exception as err:
                print("crawl failed %s" % err)
        self.output.output()
class SpiderMan(object):
    """Crawl Mtime movie pages and fetch each movie's rating/box-office JSON."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl rating data for every movie linked from root_url.

        :param root_url: entry page; the parser yields (movie page URL, movie id)
                         pairs from its content
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build a rating/box-office request per movie, e.g.:
        # http://service.library.mtime.com/Movie.api?
        #   Ajax_CallBack=true
        #   &Ajax_CallBackType=Mtime.Library.Services
        #   &Ajax_CallBackMethod=GetMovieOverviewRating
        #   &Ajax_CrossDomain=1
        #   &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F108737%2F
        #   &t=2016 11 13 22 31 49 3282
        #   &Ajax_CallBackArgument0=108737
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Services'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                # Include the exception in the message: the original bound `e`
                # but never used it, hiding the failure cause.
                print("Crawl failed: %s" % e)
        self.output.output_end()
        print("Crawl finish")
class SpiderMan(object):
    """Crawl Mtime movie ratings and persist them to a database or xlsx file."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.store_db = StorageSQL.Storage()
        self.store_xlsx = StorageXlsx.Storage()

    def _rank_url(self, url):
        """Build the rating-API URL for one (page URL, movie id) pair.

        Shared by crawl_db and crawl_xlsx, which previously duplicated it.
        """
        # Fixed: the original '%Y%m%d%H%MS3282' dropped the '%' before S,
        # emitting a literal 'S' instead of seconds; sibling crawlers in this
        # file use '%Y%m%d%H%M%S<suffix>'.
        t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
        return ('http://service.library.mtime.com/Movie.api'
                '?Ajax_CallBack=true'
                '&Ajax_CallBackType=Mtime.Library.Services'
                '&Ajax_CallBackMethod=GetMovieOverviewRating'
                '&Ajax_CrossDomain=1'
                '&Ajax_RequestUrl=%s'
                '&t=%s'
                '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))

    def crawl_db(self, root_url):
        '''
        Initialize the crawler with an original url --database version
        :param root_url: The original url
        :return:
        '''
        content = self.downloader.root_download(root_url)
        urls = self.parser.parser_url(root_url, content)
        i = 0
        for url in urls:
            try:
                rank_url = self._rank_url(url)
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.store_db.store_data(data)
                i += 1
                print('Crawling completion: %s times.' % i)
            except Exception as e:
                # Report the cause: the original bare message hid the error.
                print('Crawling failed! %s' % e)
        self.store_db.store_end()
        print('Crawling finished! Exiting programme...')

    def crawl_xlsx(self, root_url):
        '''
        Initialize the crawler with an original url --xlsx version
        :param root_url: The original url
        :return:
        '''
        content = self.downloader.root_download(root_url)
        urls = self.parser.parser_url(root_url, content)
        i = 0
        row = 1  # row 1 holds the header written by write_head()
        self.store_xlsx.write_head()
        for url in urls:
            try:
                i += 1
                row += 1
                rank_url = self._rank_url(url)
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.store_xlsx.write_data(row, data)
                print('Crawling completion: %s times.' % i)
            except Exception as e:
                print('Crawling failed! %s' % e)
        self.store_xlsx.write_end()
        print('Crawling finished! Exiting programme...')