class SpiderMan(object):
    """Drive one crawl: download a root page, then fetch and store per-entry data.

    The downloader, parser and output objects are created in ``__init__``;
    their classes are defined outside this block.
    """

    # Request template for the rating endpoint. The placeholders are filled,
    # in order, with the first element of a parsed entry, a timestamp token
    # and the second element of the parsed entry.
    _RANK_URL_FORMAT = (
        'http://service.library.mtime.com/Movie.api'
        '?Ajax_CallBack=true'
        '&Ajax_CallBackType=Mtime.Library.Services'
        '&Ajax_CallBackMethod=GetMovieOverviewRating'
        '&Ajax_CrossDomain=1'
        '&Ajax_RequestUrl=%s'
        '&t=%s'
        '&Ajax_CallBackArgument0=%s'
    )

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl ``root_url`` and store the data parsed for each entry found.

        Every entry returned by ``parser.parser_url`` is indexed at ``[0]``
        and ``[1]`` to build a rating request. A failure while handling one
        entry is reported with ``'Crawl failed'`` and does not stop the
        remaining entries. ``output.output_end()`` is called once the loop
        is done.

        Args:
            root_url: URL of the page whose entries should be crawled.
        """
        root_page = self.downloader.download(root_url)
        for entry in self.parser.parser_url(root_url, root_page):
            try:
                # Local time as YYYYmmddHHMMSS followed by the literal "3282".
                stamp = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = self._RANK_URL_FORMAT % (entry[0], stamp, entry[1])
                rank_page = self.downloader.download(rank_url)
                parsed = self.parser.parser_json(rank_url, rank_page)
                self.output.store_data(parsed)
            except Exception:
                # Best effort: one bad entry must not abort the whole crawl.
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
Exemple #2
0
 def store_proc(self, store_q: Queue):
     """Consume items from ``store_q`` and pass them to a ``DataOutput``.

     Runs until the sentinel string ``'end'`` is received. At that point the
     output is flushed, ``output_end`` is called with the output's
     ``filepath``, and the method returns.

     Args:
         store_q: Queue from which the items to store are read.
     """
     output = DataOutput()
     while True:
         # Block until an item arrives. This replaces the previous
         # empty()/sleep(0.1) polling loop: empty() is not a reliable
         # readiness check on multi-producer queues, and polling adds
         # latency and needless wakeups.
         data = store_q.get()
         if data == 'end':
             print('存储进程接收通知然后结束!')
             output.flush_data()
             output.output_end(output.filepath)
             return
         output.store_data(data)
Exemple #3
0
class Spider():
    """Crawl scheduler that wires a URL manager, downloader, parser and output.

    The four collaborator classes are defined outside this block.
    """

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        self.output = DataOutput()

    def _should_continue(self):
        """Return a truthy value while URLs remain and fewer than 50 are done."""
        return self.manager.has_new_url() and self.manager.old_urls_size() < 50

    def crawl(self, root_url):
        """Crawl starting at ``root_url`` and write the collected data as HTML.

        Each iteration takes one URL from the manager, downloads and parses
        it, queues the discovered URLs and stores the parsed data. An
        exception during an iteration is printed and the loop moves on.
        ``output.output_html()`` is called after the loop ends.

        Args:
            root_url: First URL handed to the URL manager.
        """
        self.manager.add_new_url(root_url)
        while self._should_continue():
            try:
                current = self.manager.get_new_url()
                page = self.downloader.download(current)
                found_urls, parsed = self.parser.parser(current, page)
                self.manager.add_new_urls(found_urls)
                self.output.store_data(parsed)
                crawled = self.manager.old_urls_size()
                print("已经抓取%s个链接" % crawled)
            except Exception as err:
                # Report the failure and carry on with the next URL.
                print(err)
        self.output.output_html()
Exemple #4
0
class Spider:
    """Crawl a listing page and store the rating data fetched for each entry.

    The downloader, parser and output objects are created in ``__init__``;
    their classes are defined outside this block.
    """

    # Rating-service request. A reference request looks like:
    #   http://service.library.mtime.com/Movie.api
    #   ?Ajax_CallBack=true
    #   &Ajax_CallBackType=Mtime.Library.Services
    #   &Ajax_CallBackMethod=GetMovieOverviewRating
    #   &Ajax_CrossDomain=1
    #   &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F
    #   &t=201710117174393728
    #   &Ajax_CallBackArgument0=246526
    # The placeholders are, in order: the percent-encoded page URL, the
    # timestamp token, and the second element of the parsed entry
    # (presumably the movie id -- confirm against the parser).
    _RANK_URL_TEMPLATE = (
        'http://service.library.mtime.com/Movie.api'
        '?Ajax_CallBack=true'
        '&Ajax_CallBackType=Mtime.Library.Services'
        '&Ajax_CallBackMethod=GetMovieOverviewRating'
        '&Ajax_CrossDomain=1'
        '&Ajax_RequestUrl=%s'
        '&t=%s'
        '&Ajax_CallBackArgument0=%s'
    )

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl ``root_url`` and store the rating data of every entry found.

        Each entry returned by ``parser.parse_url`` is indexed at ``[0]``
        (page URL) and ``[1]`` to build the rating request. Any exception
        raised while handling an entry propagates to the caller; in that
        case ``output.output_end()`` is not reached.

        Args:
            root_url: URL of the listing page to start from.
        """
        # Imported locally so this block does not depend on a file-level import.
        from urllib.parse import quote

        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            # Local time as YYYYmmddHHMMSS followed by the literal "3282".
            t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
            # safe='' percent-encodes ':' and every '/', matching the
            # reference request above (including the trailing '%2F').
            encoded_page_url = quote(url[0], safe='')
            rank_url = self._RANK_URL_TEMPLATE % (encoded_page_url, t, url[1])
            rank_content = self.downloader.download(rank_url)
            if rank_content is None:
                # Only reported here; parse_json still receives None, and how
                # it handles that is not visible from this block.
                print('None')
            data = self.parser.parse_json(rank_url, rank_content)
            self.output.store_data(data)

        self.output.output_end()
        print('Crawl finish')
Exemple #5
0
 def __init__(self):
     """Create the four collaborator objects and keep them on the instance.

     The classes are defined outside this block, so the role notes below
     are inferred from their names only.
     """
     self.manager = UrlManager()  # presumably tracks URLs to visit -- confirm
     self.downloader = HTMLDownloader()  # presumably fetches pages -- confirm
     self.parser = HTMLParser()  # presumably extracts links/data -- confirm
     self.output = DataOutput()  # presumably stores the results -- confirm
 def __init__(self):
     """Create the three collaborator objects and keep them on the instance.

     The classes are defined outside this block, so the role notes below
     are inferred from their names only.
     """
     self.downloader = HtmlDownloader()  # presumably fetches pages -- confirm
     self.parser = HtmlParser()  # presumably extracts data -- confirm
     self.output = DataOutput()  # presumably stores the results -- confirm