Example #1
import time

class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the URL of the AJAX endpoint that returns the box-office rating data
        for url in urls:
            print('---------->URL', url, url[0], url[1])
            try:
                # Timestamp parameter for the API; the trailing '3282' is a fixed literal
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print('AJAX response content-------->', rank_content)
                print('AJAX response content type-------->', type(rank_content))
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Failed to fetch AJAX dynamic data:', e)
        self.output.output_end()
        print('=======end=========')
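None of the collaborator classes (HtmlDownloader, HtmlParse, DataOutput) are shown on this page. As a reference point, here is a minimal sketch of a downloader matching the downloader(url) call above, assuming the requests library; the real class may differ:

import requests


class HtmlDownloader(object):
    """Minimal sketch of the downloader assumed by the example above."""

    def downloader(self, url):
        # Send a browser-like User-Agent; many sites reject the default one
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        try:
            r = requests.get(url, headers=headers, timeout=10)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.text
        except requests.RequestException:
            pass
        return None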
Example #2
class SpiderMain(object):
    def __init__(self):
        self.crawl = Htmldownloarder()
        self.jsondata = HtmlParse()

    def run(self):
        page = '1'
        time_from = '20190722'
        time_to = '20190729'
        response = self.crawl.getresponse(page, time_from, time_to)
        user_list = self.jsondata.parse(response)
        # Once we have the user ids, we can request their album contents
        for each in user_list:
            user_photo_response = self.crawl.photo_infrom(each)
            self.jsondata.photo_parse(user_photo_response)
Example #3
class SpiderManager(object):

    def __init__(self):

        self.urlManager = UrlManager.UrlManager()
        self.htmlParse = HtmlParse()
        self.htmlDownload = HtmlDownloader.HtmlDownload()
        self.dataSave = DataSave.DataSave()

    def crawl(self, root_url):
        self.urlManager.add_new_urls(root_url)

        # For now, only download 10 URLs
        while (self.urlManager.has_new_url()
               and self.urlManager.get_old_url_size() < 10):
            try:
                new_url = self.urlManager.get_new_url()
                cont = self.htmlDownload.download_get(new_url)
                new_urls, new_data = self.htmlParse.do_parser(new_url, cont)
                self.urlManager.add_new_urls(new_urls)
                self.dataSave.store_data(new_data)

                print("Crawled %s links so far" % self.urlManager.get_old_url_size())
            except Exception as e:
                print("crawl fail %s" % e)

        self.dataSave.output_data()
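Examples #3 through #5 all drive the crawl through a UrlManager that is never shown here. A sketch consistent with the calls made above (add_new_url, add_new_urls, has_new_url, get_new_url, and the two size methods); the set-based bookkeeping is an assumption:

class UrlManager(object):
    """Sketch of the URL bookkeeping the schedulers above assume."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Ignore duplicates and URLs we have already processed
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the visited set
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)

    # Example #3 uses this alternative name for the same counter
    get_old_url_size = old_url_size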
Example #4
class Scheduler(object):
    def __init__(self):
        self._manager = UrlManager()
        self._download = HtmlDownload()
        self._parse = HtmlParse()
        self._output = DataOutput()

    def crawl(self, root_url):
        # Seed the manager with the entry URL
        self._manager.add_new_url(root_url)
        # Keep going while the manager still has new URLs
        while self._manager.has_new_url():
            try:
                # Fetch the next URL from the URL manager
                new_url = self._manager.get_new_url()
                # Download the page
                response = self._download.download(new_url)
                # Parse the page and extract its data
                new_urls, data = self._parse.parse(new_url, response)
                # Feed the newly extracted URLs back into the URL manager
                self._manager.add_new_urls(new_urls)
                # Hand the extracted data to the data store
                self._output.store_data(data)
                print('Crawled %s links so far' % self._manager.old_url_size())
                print(new_urls)
            except Exception as e:
                print('crawl Error', e)
        # Have the data store write the output file in the target format
        try:
            self._output.output_db()
        except Exception:
            print('output Error')
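The DataOutput used by store_data()/output_db() is also not shown. A sketch backed by sqlite3; the database file, table name, schema, and the dict shape of each data record are assumptions:

import sqlite3


class DataOutput(object):
    """Sketch of the data store assumed by store_data()/output_db()."""

    def __init__(self):
        self.datas = []  # buffer rows in memory until output_db() is called

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_db(self):
        conn = sqlite3.connect('crawl.db')
        conn.execute('CREATE TABLE IF NOT EXISTS pages (url TEXT, title TEXT)')
        conn.executemany(
            'INSERT INTO pages (url, title) VALUES (?, ?)',
            [(d.get('url'), d.get('title')) for d in self.datas])
        conn.commit()
        conn.close()
        self.datas = []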
Example #5
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parse = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        '''
        Add the entry URL and run the crawl.
        :param root_url:
        :return:
        '''
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs, capping how many we crawl
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.download.download(new_url)
                # Parse the HTML and extract its data
                new_urls, data = self.parse.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                self.output.output_html()
                print('Crawled %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl fail', e)
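HtmlParse itself never appears on this page. For the scheduler-style examples, here is a parse(url, html) sketch that returns a (new_urls, data) pair, assuming BeautifulSoup and purely illustrative extraction:

from urllib.parse import urljoin

from bs4 import BeautifulSoup


class HtmlParse(object):
    """Sketch of a parser matching the parse(url, html) calls above."""

    def parse(self, page_url, html):
        if html is None:
            return set(), None
        soup = BeautifulSoup(html, 'html.parser')
        # Collect absolute links to feed back into the UrlManager
        new_urls = {urljoin(page_url, a['href'])
                    for a in soup.find_all('a', href=True)}
        # Illustrative data extraction: page title plus source URL
        data = {'url': page_url,
                'title': soup.title.string if soup.title else None}
        return new_urls, data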
Example #6
from multiprocessing.managers import BaseManager

class SpiderWork(object):
    def __init__(self):
        print("spider work init start")

        BaseManager.register("get_task_queue")
        BaseManager.register("get_result_queue")

        server_addr = '127.0.0.1'
        self.manager = BaseManager(address=(server_addr, 8101), authkey=b'ZF')
        try:
            self.manager.connect()
        except BaseException as e:
            print(e)

        self.task = self.manager.get_task_queue()  # type: Queue

        print(self.task, self.task.empty())

        self.result = self.manager.get_result_queue()  # type: Queue
        self.downloader = HtmlDownload()
        self.parser = HtmlParse()
        print("spider work init finish")
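Example #6 shows only the worker's initialisation. A worker like this would typically loop over the shared task queue; here is a sketch of such a crawl method on SpiderWork, where the method name, the 'end' sentinel, the message format, and the download() method name are all assumptions:

    def crawl(self):
        # Keep pulling URLs from the controller's task queue
        while True:
            url = self.task.get()
            if url == 'end':
                # Sentinel value: tell the controller this worker is done
                self.result.put({'new_urls': 'end', 'data': 'end'})
                return
            try:
                content = self.downloader.download(url)
                new_urls, data = self.parser.parse(url, content)
                self.result.put({'new_urls': new_urls, 'data': data})
            except Exception as e:
                print('crawl fail on %s: %s' % (url, e))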
Example #7
from HtmlParse import HtmlParse
from entity.Music import Music
from entity.ShareMusic import ShareMusic

if __name__ == '__main__':
    songurl = 'https://music.163.com/song?id=4989687&userid=503583378'
    share_music = ShareMusic(songurl)
    songid = share_music.songid
    music = Music(songid)
    HtmlParse(songurl).parse_html(music)
    print(music)
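ShareMusic evidently extracts the song id from the share URL before Music and HtmlParse take over. A sketch of that extraction, assuming the id is carried in the id query parameter (the real entity class may hold more fields):

from urllib.parse import urlparse, parse_qs


class ShareMusic(object):
    """Sketch: pull the song id out of a music.163.com share link."""

    def __init__(self, url):
        self.url = url
        query = parse_qs(urlparse(url).query)
        # e.g. for ...song?id=4989687&userid=503583378 this yields '4989687'
        self.songid = query.get('id', [None])[0]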