class SpiderMan(object):
    """Crawler entry point for mtime.com movie ratings.

    Fetches a listing page, extracts (movie_url, movie_id) pairs, then
    fetches and stores the JSON rating payload for each movie.
    """

    def __init__(self):
        self.downloader = HtmlDownloader()  # HTTP fetcher
        self.parser = HtmlParser()          # HTML/JSON extractor
        self.output = DataOutput()          # persistence layer

    def crawl(self, root_url):
        """Crawl *root_url* and store a rating record per extracted movie.

        :param root_url: listing page whose links the parser turns into
            (movie_url, movie_id) tuples
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                # Timestamp token for the API; the literal '3282' suffix
                # mimics the millisecond-like tail seen in real requests.
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                # Best-effort per-URL crawl, but report which URL failed
                # and why (the original swallowed the exception silently).
                print('Crawl failed: %s (%s)' % (url, e))
        self.output.output_end()
        print('Crawl finish')
def store_proc(self, store_q: Queue):
    """Consumer-process loop: persist items pulled from *store_q*.

    Runs until the string sentinel 'end' arrives, then flushes and
    finalizes the output file before returning.

    :param store_q: multiprocessing queue carrying parsed data items,
        terminated by the 'end' sentinel
    """
    output = DataOutput()
    while True:
        # Blocking get() replaces the original empty()/sleep(0.1)
        # busy-polling loop: same consumer semantics, no CPU spin.
        data = store_q.get()
        if data == 'end':
            print('存储进程接收通知然后结束!')
            output.flush_data()
            output.output_end(output.filepath)
            return
        output.store_data(data)
class Spider():
    """BFS crawler: maintains a URL frontier, downloads pages, extracts
    new links plus data, and persists the data as HTML output."""

    def __init__(self):
        self.manager = UrlManager()          # URL frontier / dedup
        self.downloader = HTMLDownloader()   # HTTP fetcher
        self.parser = HTMLParser()           # link + data extractor
        self.output = DataOutput()           # persistence layer

    def crawl(self, root_url, max_pages=50):
        """Crawl from *root_url* until the frontier empties or the page
        budget is spent, then write the collected data.

        :param root_url: seed URL added to the frontier
        :param max_pages: stop after this many pages have been fetched
            (default 50, preserving the previous hard-coded limit)
        """
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_urls_size() < max_pages):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_urls_size())
            except Exception as e:
                # Best-effort: a single bad page must not abort the crawl.
                print(e)
        self.output.output_html()
class Spider:
    """Crawler for mtime.com movie ratings: fetches a listing page, then
    queries the Movie.api rating endpoint for each extracted movie."""

    def __init__(self):
        self.downloader = HtmlDownloader()  # HTTP fetcher
        self.parser = HtmlParser()          # HTML/JSON extractor
        self.output = DataOutput()          # persistence layer

    def crawl(self, root_url):
        """Crawl *root_url* and store one rating record per movie link.

        :param root_url: listing page whose links the parser turns into
            (movie_url, movie_id) tuples
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # Target request shape (sample):
                # http://service.library.mtime.com/Movie.api
                #   ?Ajax_CallBack=true
                #   &Ajax_CallBackType=Mtime.Library.Services
                #   &Ajax_CallBackMethod=GetMovieOverviewRating
                #   &Ajax_CrossDomain=1
                #   &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F
                #   &t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                # Only '://' is percent-encoded and the trailing slash
                # dropped, matching the sample above; quote() would also
                # encode the path slashes, which the sample does not.
                # Fixed 'Ajax_CallbackArgument0' -> 'Ajax_CallBackArgument0'
                # to agree with the sample URL's parameter name.
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (
                               url[0].replace('://', '%3A%2F%2F')[:-1],
                               t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    # Skip instead of feeding None into the JSON parser.
                    print('None')
                    continue
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception:
                # Bare raise preserves the original traceback; 'raise e'
                # re-raised from this frame and obscured the origin.
                raise
        self.output.output_end()
        print('Crawl finish')
def __init__(self):
    # Wire up the crawler's collaborators: persistence, extraction,
    # fetching, and the URL frontier.
    self.output = DataOutput()
    self.parser = HTMLParser()
    self.downloader = HTMLDownloader()
    self.manager = UrlManager()
def __init__(self):
    # Assemble the three pipeline stages: store <- parse <- fetch.
    self.output = DataOutput()
    self.parser = HtmlParser()
    self.downloader = HtmlDownloader()