# Example 1
class SpiderMan(object):
    """Coordinate downloading, parsing and storing Mtime movie rating data."""

    def __init__(self):
        # Collaborators are project components defined elsewhere in the repo.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl movie pages reachable from *root_url* and store their ratings.

        :param root_url: listing page whose movie links seed the crawl
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                # Small delay to be polite to the server between requests.
                time.sleep(0.1)
                t = time.strftime("%Y%m%d%H%M%S", time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Services'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                # Report the failure reason instead of swallowing it silently.
                print('Crawl failed: %s' % e)
        self.output.output_end()
        print("Crawl finish")
# Example 2
class Main(object):
    """Drive the Mtime crawl: download a listing page, then fetch each movie's rating JSON."""

    def __init__(self):
        # Downloader and parser are project components defined elsewhere.
        self.download = HtmlDownloader()
        self.par = HtmlParser()

    def action(self, root_url):
        """Download *root_url*, extract movie urls and fetch rating data for each.

        :param root_url: the listing page to start from
        """
        html = self.download.download(root_url)
        # Parse the current page for movie urls.
        urls = self.par.parser_url(html)
        # Build the rating-API url for every movie found.
        for url in urls:
            t = time.strftime("%Y%m%d%H%M%S2877", time.localtime())
            new_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s" % (
                url[0], t, url[1])
            print(new_url)
            # Hand the freshly built url to the downloader.
            detail_html = self.download.download(new_url)
            print(detail_html)
            self.par.parser_json(detail_html)
# Example 3
class SpiderMan(object):
    """Queue-driven crawler: seed the url manager once, then process urls until the budget is hit."""

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Seed the manager from *root_url* and crawl at most 10 pages, storing parsed data."""
        seed_page = self.downloader.download(root_url)
        discovered = self.parser.parser_json(root_url, seed_page)
        self.manager.add_new_urls(discovered)
        while True:
            # Stop once the queue drains or 10 urls have been consumed.
            if not self.manager.has_new_url():
                break
            if self.manager.old_url_size() >= 10:
                break
            try:
                page_url = self.manager.get_new_url()
                page_html = self.downloader.download(page_url)
                record = self.parser.parser(page_url, page_html)
                self.output.store_data(record)
            except Exception as e:
                print("crawl failed %s" % e)
        self.output.output()
# Example 4
class SpiderMan(object):
    """Fetch the movie listing, then query the rating API for every movie on it."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Download *root_url*, then fetch and store rating data for each movie link found.

        The rating endpoint has the shape:
        http://service.library.mtime.com/Movie.api?Ajax_CallBack=true
            &Ajax_CallBackType=Mtime.Library.Services
            &Ajax_CallBackMethod=GetMovieOverviewRating
            &Ajax_CrossDomain=1
            &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F108737%2F
            &t=<timestamp>
            &Ajax_CallBackArgument0=108737
        """
        listing = self.downloader.download(root_url)
        movie_urls = self.parser.parser_url(root_url, listing)
        # The template never changes, so build it once outside the loop.
        template = ('http://service.library.mtime.com/Movie.api'
                    '?Ajax_CallBack=true'
                    '&Ajax_CallBackType=Mtime.Library.Services'
                    '&Ajax_CallBackMethod=GetMovieOverviewRating'
                    '&Ajax_CrossDomain=1'
                    '&Ajax_RequestUrl=%s'
                    '&t=%s'
                    '&Ajax_CallBackArgument0=%s')
        for movie in movie_urls:
            try:
                stamp = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = template % (movie[0], stamp, movie[1])
                rating_page = self.downloader.download(rank_url)
                record = self.parser.parser_json(rank_url, rating_page)
                self.output.store_data(record)
            except Exception as e:
                print("Crawl failed")
        self.output.output_end()
        print("Crawl finish")
# Example 5
class SpiderMan(object):
    """Crawler with two interchangeable storage backends: a SQL database and an xlsx workbook."""

    # Rating-API url template; %s slots: encoded movie page url, timestamp, movie id.
    _RANK_URL = ('http://service.library.mtime.com/Movie.api'
                 '?Ajax_CallBack=true'
                 '&Ajax_CallBackType=Mtime.Library.Services'
                 '&Ajax_CallBackMethod=GetMovieOverviewRating'
                 '&Ajax_CrossDomain=1'
                 '&Ajax_RequestUrl=%s'
                 '&t=%s'
                 '&Ajax_CallBackArgument0=%s')

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.store_db = StorageSQL.Storage()
        self.store_xlsx = StorageXlsx.Storage()

    def _build_rank_url(self, url, t=None):
        """Return the rating-API url for one (encoded_page_url, movie_id) pair.

        :param url: sequence whose [0] is the encoded movie page url and [1] the movie id
        :param t: timestamp string; generated from the current local time when None
        :return: the fully formatted rating-API url
        """
        if t is None:
            # Bug fix: the original format was '%Y%m%d%H%MS3282' (missing '%'
            # before S), which dropped the seconds field; the sibling crawlers
            # in this file all use %S followed by the literal token.
            t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
        return self._RANK_URL % (url[0], t, url[1])

    def crawl_db(self, root_url):
        '''
        Initialize the crawler with an original url --database version
        :param root_url: The original url
        :return:
        '''
        content = self.downloader.root_download(root_url)
        urls = self.parser.parser_url(root_url, content)
        i = 0
        for url in urls:
            try:
                rank_url = self._build_rank_url(url)
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.store_db.store_data(data)
                i += 1
                print('Crawling completion: %s times.' % i)
            except Exception:
                print('Crawling failed!')
        self.store_db.store_end()
        print('Crawling finished! Exiting programme...')

    def crawl_xlsx(self, root_url):
        '''
        Initialize the crawler with an original url --xlsx version
        :param root_url: The original url
        :return:
        '''
        content = self.downloader.root_download(root_url)
        urls = self.parser.parser_url(root_url, content)
        i = 0
        row = 1  # row 1 holds the header; data starts at row 2
        self.store_xlsx.write_head()
        for url in urls:
            try:
                i += 1
                row += 1
                rank_url = self._build_rank_url(url)
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.store_xlsx.write_data(row, data)
                print('Crawling completion: %s times.' % i)
            except Exception:
                print('Crawling failed!')
        self.store_xlsx.write_end()
        print('Crawling finished! Exiting programme...')