Example #1
0
class SpiderMan(object):
    """Crawl a root page and store the rating data of every entry found on it.

    The downloader, parser and output collaborators are created in
    ``__init__``; their behavior is defined outside this class.
    """

    # Rating-service request. The three placeholders are filled, in order,
    # with the entry's request URL, the timestamp token and the callback
    # argument.
    _RANK_URL_TEMPLATE = (
        'http://service.library.mtime.com/Movie.api'
        '?Ajax_CallBack=true'
        '&Ajax_CallBackType=Mtime.Library.Services'
        '&Ajax_CallBackMethod=GetMovieOverviewRating'
        '&Ajax_CrossDomain=1'
        '&Ajax_RequestUrl=%s'
        '&t=%s'
        '&Ajax_CallBackArgument0=%s'
    )

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Download ``root_url``, then fetch and store data for each entry.

        Each item yielded by ``parser.parser_url`` is indexed as a pair:
        ``url[0]`` becomes ``Ajax_RequestUrl`` and ``url[1]`` becomes
        ``Ajax_CallBackArgument0`` (presumably a page URL and its id --
        confirm against the parser).

        A failure while handling one entry is reported and the loop moves on
        to the next entry; ``output.output_end()`` is always called once the
        loop is over.
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                # Current local time followed by the fixed literal "3282".
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = self._RANK_URL_TEMPLATE % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                # Best-effort crawl: keep going, but say which entry failed
                # and why instead of discarding the exception.
                print('Crawl failed: %s (%r)' % (url, e))
        self.output.output_end()
        print('Crawl finish')
Example #2
0
 def store_proc(self, store_q):
     """Consume items from ``store_q`` and hand them to a ``DataOutput``.

     Every item taken from the queue is passed to ``output.store_data``
     until the sentinel string ``'end'`` arrives; the output is then
     finalized with ``output.output_end(output.filepath)`` and the method
     returns.

     :param store_q: queue-like object; only its ``get()`` method is used.
     """
     print('store')
     output = DataOutput()
     while True:
         # A blocking get() waits for the next item, so there is no need to
         # poll empty() and sleep between checks.
         data = store_q.get()
         if data == 'end':
             print('存储进程接受通知然后结束!')
             output.output_end(output.filepath)
             return
         output.store_data(data)
0
class Spider:
    """Crawl a root page and store the rating data of every entry found on it.

    The downloader, parser and output collaborators are created in
    ``__init__``; their behavior is defined outside this class.
    """

    # Rating-service request. A sample of the real request looks like:
    #   http://service.library.mtime.com/Movie.api
    #   ?Ajax_CallBack=true
    #   &Ajax_CallBackType=Mtime.Library.Services
    #   &Ajax_CallBackMethod=GetMovieOverviewRating
    #   &Ajax_CrossDomain=1
    #   &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F
    #   &t=201710117174393728
    #   &Ajax_CallBackArgument0=246526
    # The three placeholders are filled, in order, with the request URL, the
    # timestamp token and the callback argument.
    _RANK_URL_TEMPLATE = (
        'http://service.library.mtime.com/Movie.api'
        '?Ajax_CallBack=true'
        '&Ajax_CallBackType=Mtime.Library.Services'
        '&Ajax_CallBackMethod=GetMovieOverviewRating'
        '&Ajax_CrossDomain=1'
        '&Ajax_RequestUrl=%s'
        '&t=%s'
        '&Ajax_CallBackArgument0=%s'
    )

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Download ``root_url``, then fetch and store data for each entry.

        Each item yielded by ``parser.parse_url`` is indexed as a pair:
        ``url[0]`` is turned into ``Ajax_RequestUrl`` and ``url[1]`` is used
        as ``Ajax_CallBackArgument0``.

        Any exception raised while handling an entry propagates to the
        caller; in that case ``output.output_end()`` is not reached.
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            # Current local time followed by the fixed literal "3282".
            t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
            # Percent-encode the scheme separator and drop the last character
            # of the entry URL.
            request_url = url[0].replace('://', '%3A%2F%2F')[:-1]
            rank_url = self._RANK_URL_TEMPLATE % (request_url, t, url[1])
            rank_content = self.downloader.download(rank_url)
            if rank_content is None:
                # Diagnostic only: the parser is still called with None.
                print('None')
            data = self.parser.parse_json(rank_url, rank_content)
            self.output.store_data(data)

        self.output.output_end()
        print('Crawl finish')