class SpiderMan(object):
    """Crawls Mtime movie pages and fetches box-office/rating data through
    the site's Ajax API.

    Converted from Python-2-only syntax (``print`` statements,
    ``except Exception, e``) to Python-3 syntax for consistency with the
    other scheduler classes in this file.
    """

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Download ``root_url``, extract movie links, then query the rating
        Ajax endpoint for each movie and store the parsed result.

        :param root_url: entry-point URL listing the movies to crawl
        """
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the Ajax URL that returns box-office data for each movie.
        for url in urls:
            print('---------->URL', url, url[0], url[1])
            try:
                # Timestamp query parameter; the literal "3282" suffix mimics
                # the millisecond part the endpoint expects — kept verbatim
                # from the original request format (TODO confirm server-side).
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print('ajax接口返回内容,汉字正常显示-------->', rank_content)
                print('ajax接口返回内容type为unicode-------->', type(rank_content))
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('获取ajax动态数据失败', e)
        self.output.output_end()
        print('=======end=========')
class SpiderMain(object):
    """Two-stage crawl driver: fetch a user-list page, then request and
    parse every listed user's photo album."""

    def __init__(self):
        # NOTE(review): "Htmldownloarder" is the project class's actual
        # (misspelled) name — kept, since the definition lives elsewhere.
        self.crawl = Htmldownloarder()
        self.jsondata = HtmlParse()

    def run(self):
        """Run one crawl pass over a fixed page and date range.

        Requests the user list, then fetches and parses each user's photos.
        """
        page = '1'
        time_from = '20190722'
        time_to = '20190729'
        response = self.crawl.getresponse(page, time_from, time_to)
        # BUG FIX: this assignment was commented out, so the loop below
        # raised NameError on `user_list`. Restored the parse step.
        user_list = self.jsondata.parse(response)
        # With the user ids in hand, request each user's album contents.
        for each in user_list:
            user_photo_response = self.crawl.photo_infrom(each)
            self.jsondata.photo_parse(user_photo_response)
class SpiderManager(object):
    """Scheduler wiring a URL manager, downloader, parser and data store.

    Rewritten with syntax valid on both Python 2.6+ and Python 3
    (``except ... as e``, single-argument ``print(...)``) for consistency
    with the Python-3 scheduler classes in this file.
    """

    def __init__(self):
        self.urlManager = UrlManager.UrlManager()
        self.htmlParse = HtmlParse()
        self.htmlDownload = HtmlDownloader.HtmlDownload()
        self.dataSave = DataSave.DataSave()

    def crawl(self, root_url):
        """Crawl from ``root_url``, following discovered links, then export
        the stored data.

        :param root_url: entry URL (passed to ``add_new_urls`` as-is)
        """
        self.urlManager.add_new_urls(root_url)
        # Cap the crawl at 10 downloaded URLs for now.
        while self.urlManager.has_new_url() and self.urlManager.get_old_url_size() < 10:
            try:
                new_url = self.urlManager.get_new_url()
                cont = self.htmlDownload.download_get(new_url)
                new_urls, new_data = self.htmlParse.do_parser(new_url, cont)
                # Feed freshly discovered links back into the manager.
                self.urlManager.add_new_urls(new_urls)
                self.dataSave.store_data(new_data)
                print("已经抓取%s个链接" % self.urlManager.get_old_url_size())
            except Exception as e:
                print("crawl fail %s" % e)
        self.dataSave.output_data()
class Schedul(object):
    """Crawl scheduler coordinating URL manager, downloader, parser and
    data output."""

    def __init__(self):
        self._manager = UrlManager()
        self._download = HtmlDownload()
        self._parse = HtmlParse()
        self._output = DataOutput()

    def crawl(self, root_url):
        """Crawl starting from ``root_url`` until the URL manager is
        exhausted, then export the stored data to the database.

        :param root_url: entry URL seeded into the URL manager
        """
        # Seed the manager with the entry URL.
        self._manager.add_new_url(root_url)
        # Keep crawling while unvisited URLs remain.
        while self._manager.has_new_url():
            try:
                # Take the next URL and download the page.
                new_url = self._manager.get_new_url()
                response = self._download.download(new_url)
                # Extract follow-up URLs and page data.  The original reused
                # the name `new_url` for the parse result; renamed for
                # clarity — the value and flow are unchanged.
                extracted_urls, data = self._parse.parse(new_url, response)
                # Feed the newly found URLs back into the manager.
                self._manager.add_new_url(extracted_urls)
                self._output.store_data(data)
                print('已经抓取了 %s 个链接' % self._manager.old_url_size())
                print(extracted_urls)
            except Exception as e:
                print('crawl Error', e)
        # Export stored data.  BUG FIX: a bare `except:` here also swallowed
        # KeyboardInterrupt/SystemExit — narrowed to Exception.
        try:
            self._output.output_db()
        except Exception:
            print('output Error')
class SpiderMan(object):
    """Scheduler tying together the URL manager, downloader, parser and
    output writer for a bounded crawl."""

    def __init__(self):
        # NOTE(review): `manger`/`outpu`/`DataOuput`/`ouput_html` are the
        # project's actual (misspelled) names — kept as external interface.
        self.manger = UrlManager()
        self.download = HtmlDownload()
        self.parse = HtmlParse()
        self.outpu = DataOuput()

    def crawl(self, root_url):
        """Seed the manager with the entry URL and crawl up to 100 links.

        :param root_url: entry URL for the crawl
        :return: None
        """
        self.manger.add_new_url(root_url)
        # Stop once no unvisited URLs remain or 100 links have been fetched.
        while self.manger.has_new_url() and self.manger.old_url_size() < 100:
            try:
                current = self.manger.get_new_url()
                page = self.download.download(current)
                found_urls, page_data = self.parse.parse(current, page)
                self.manger.add_new_urls(found_urls)
                # Persist the data and rewrite the HTML report each round.
                self.outpu.store_data(page_data)
                self.outpu.ouput_html()
                print('已抓取%s个链接' % self.manger.old_url_size())
            except Exception as e:
                print('crawl fail', e)
class SpiderWork(object):
    """Distributed-crawl worker: connects to the control node's shared
    task/result queues and prepares a downloader and parser."""

    def __init__(self):
        print("spider work init start")
        # Register the remote queue accessors exposed by the control node.
        BaseManager.register("get_task_queue")
        BaseManager.register("get_result_queue")
        server_addr = '127.0.0.1'
        # BUG FIX: authkey must be bytes under Python 3 (a str raises);
        # b'ZF' is equally valid under Python 2.
        self.manager = BaseManager(address=(server_addr, 8101), authkey=b'ZF')
        try:
            self.manager.connect()
        except BaseException as e:
            # Best-effort connect: log and continue (original behavior kept).
            print(e)
        self.task = self.manager.get_task_queue()  # type: Queue
        print(self.task, self.task.empty())
        self.result = self.manager.get_result_queue()  # type: Queue
        self.downloader = HtmlDownload()
        self.parser = HtmlParse()
        print("spider work init finish")
def __init__(self):
    """Wire up the crawler's collaborators: URL manager, HTML parser,
    downloader and data store."""
    self.urlManager = UrlManager.UrlManager()
    self.htmlParse = HtmlParse()
    self.htmlDownload = HtmlDownloader.HtmlDownload()
    self.dataSave = DataSave.DataSave()
def __init__(self):
    """Create the spider's downloader, parser and output components."""
    self.downloader = HtmlDownloader()
    self.parser = HtmlParse()
    self.output = DataOutput()
def __init__(self):
    """Instantiate the private crawl collaborators: URL manager,
    downloader, parser and output writer."""
    self._manager = UrlManager()
    self._download = HtmlDownload()
    self._parse = HtmlParse()
    self._output = DataOutput()
def __init__(self):
    """Build the crawl components.

    NOTE(review): `manger`, `outpu` and `DataOuput` are the project's
    actual (misspelled) names — kept as external interface.
    """
    self.manger = UrlManager()
    self.download = HtmlDownload()
    self.parse = HtmlParse()
    self.outpu = DataOuput()
from HtmlParse import HtmlParse
from entity.Music import Music
from entity.ShareMusic import ShareMusic


def _run(song_url):
    """Resolve the song id from a NetEase share link, parse the track page
    into a Music record, and print it."""
    song_id = ShareMusic(song_url).songid
    track = Music(song_id)
    HtmlParse(song_url).parse_html(track)
    print(track)


if __name__ == '__main__':
    _run('https://music.163.com/song?id=4989687&userid=503583378')
def __init__(self):
    """Create the downloader and JSON parser used by the crawl run.

    NOTE(review): `Htmldownloarder` is the project class's actual
    (misspelled) name — kept, since its definition lives elsewhere.
    """
    self.crawl = Htmldownloarder()
    self.jsondata = HtmlParse()