Code Example #1
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0],t,url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
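Each of these examples delegates fetching to an HtmlDownloader that is never shown in this listing. Below is a minimal sketch of what such a downloader might look like, using requests; the class and method names are taken from the examples, but the body is an assumption rather than any of the quoted projects' actual code.

import requests

class HtmlDownloader:
    # Hypothetical downloader matching the interface used in these examples.
    def download(self, url):
        # Return the page body as text, or None on failure.
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        return None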
Code Example #2
 def __init__(self):
     """构造函数,初始化属性"""
     self.urls = UrlManager()
     self.log = MyLog("spider_main", "logs")
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Code Example #3
File: spider_main.py Project: KIMI-Z/python-spider
class SpiderMain:

    def __init__(self):
        """
        Initializer: instantiate the other components.
        """
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the spider.
        :return:
        """
        """ 页码 """
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_'+str(a)+'.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
Code Example #4
 def __init__(self, url):
     self.root_url = url
     self.urlManager = UrlManager()
     self.dLoader = HtmlDLoader()
     self.contParser = HtmlParser()
     self.contOutputer = HtmlOutputer()
Code Example #5
File: spider_main.py Project: KIMI-Z/python-spider
 def __init__(self):
     """
     Initializer: instantiate the other components.
     """
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Code Example #6
 def __init__(self):
     self.url = UrlManager()
     self.downloader = Downloader()
     self.parser = HtmlParser()
     self.output = OutputUse()
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36',
     }
Code Example #7
 def test_get_links(self):
     """
     Tests get_links method
     """
     file_util = FileUtil()
     expected_links = file_util.get_file_contents("links_test_data.txt")
     html_parser = HtmlParser()
     html_test_data = file_util.get_file_contents("html_test_data.html")
     actual_links = html_parser.get_links(html_test_data)
     self.assertEqual(expected_links, actual_links)
Code Example #8
 def test_get_web_pages(self):
     """
     Tests get_web_pages method
     """
     file_util = FileUtil()
     expected_web_pages = file_util.get_file_contents("web_pages_test_data.txt")
     html_parser = HtmlParser()
     same_hostname_urls = file_util.get_file_contents("same_hostname_urls_test_data.txt")
     actual_web_pages = html_parser.get_web_pages(same_hostname_urls)
     self.assertEqual(expected_web_pages, actual_web_pages)
Code Example #9
 def test_get_same_hostname_urls(self):
     """
     Tests get_same_hostname_urls method
     """
     file_util = FileUtil()
     expected_same_hostname_urls = file_util.get_file_contents("same_hostname_urls_test_data.txt")
     html_parser = HtmlParser()
     hostname = "http://www.domain.com/"
     links = file_util.get_file_contents("links_test_data.txt")
     actual_same_hostname_urls = html_parser.get_same_hostname_urls(hostname, links)
     self.assertEqual(expected_same_hostname_urls, actual_same_hostname_urls)
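The three tests above exercise HtmlParser.get_links, get_same_hostname_urls and get_web_pages without showing their implementations. As a rough illustration, get_links could be built on the standard library's html.parser; this is an assumption for context, not the tested project's code.

from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    # Collects href attributes from <a> tags.
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_links(html):
    # Feed the HTML to the extractor and return the link targets found in it.
    extractor = LinkExtractor()
    extractor.feed(html)
    return extractor.links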
Code Example #10
 def __init__(self):
     # URL manager
     # self.urls = UrlManager.UrlManager()
     self.urls = UrlManager()
     # HTML downloader
     # self.downloader = HtmlDownloader.HtmlDownloader()
     self.downloader = HtmlDownloader()
     # HTML parser
     # self.parser = html_parser.HtmlParser()
     self.parser = HtmlParser()
     # self.outputer = html_outputer.HtmlOutputer()
     self.outputer = HtmlOutputer()
Code Example #11
 def parse_html(page_url):
     html_string = ''
     try:
         response = urlopen(page_url, timeout=5)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = HtmlParser(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set(), html_string
     return finder.page_links(), html_string
Code Example #12
 def __init__(self):
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
     self.m.connect()
     self.task = self.m.get_task_queue()
     print(self.task.qsize())
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Code Example #13
File: parser.py Project: YuriiYakymiv/proSapientTest
    async def parse(self):
        retries = PasswordPageParser.RETRIES
        status = None
        while retries > 0 and status != 200:
            html, status = await self.get_page()
            retries -= 1

        # Initialize so the check below doesn't raise NameError when all retries fail
        table_data = None
        if status == 200:
            table_parser = HtmlParser(html)
            table_data = table_parser.parse('get_table_data')

        if table_data:
            arranged_table_data = self.arrange_table_data(table_data)
            return arranged_table_data
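The parse coroutine above relies on an async get_page() helper that returns the page body together with its HTTP status. A minimal sketch of such a helper using aiohttp follows; the class stub, the self.url attribute, and the error handling are assumptions, not the original project's code.

import aiohttp

class PasswordPageParser:
    # Stub of the class above, only to show a possible get_page helper.
    RETRIES = 3  # assumed value

    def __init__(self, url):
        self.url = url  # assumed attribute holding the page to fetch

    async def get_page(self):
        # Fetch self.url and return (body, status); (None, None) on network errors.
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(self.url) as response:
                    return await response.text(), response.status
        except aiohttp.ClientError:
            return None, None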
Code Example #14
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider nodes to stop working...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
Code Example #15
File: spider.py Project: ZhitongLei/web
	def start(self):
		url_queue = Queue.Queue()
		url_queue.put((self.root_request_info.url, 0))

		request_info = RequestInfo('', None, self.root_request_info.headers)
		fetcher = Fetcher()

		while not url_queue.empty():
			curr_url, depth = url_queue.get()		
			#print 'url=%s, depth=%d' % (curr_url, depth)
			print curr_url

			if depth > self.depth_limit:
				continue
			
			depth += 1
			request_info.url = curr_url
			page_content = fetcher.request(request_info)

			## parse page
			## Content.parse(page_content)

			url_list = HtmlParser.extract_url(curr_url, page_content)
			if url_list:
				for url in url_list:
					url_queue.put((url, depth))
Code Example #16
File: spider_main.py Project: linanster/python
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print str(e)
                print "%d craw failed : %s" % (count, new_url)
        self.outputer.output()
Code Example #17
    def get_parser(self, dom):
        lettingInformationDiv = dom.find("div", id="lettingInformation")

        if lettingInformationDiv:
            return HtmlParser(dom)
        else:
            return PageModelParser(dom)
Code Example #18
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Add the first URL to crawl
        self.urls.add_new_url(root_url)
        # While the manager still has URLs, take one and request it; stop when none are left.
        while self.urls.has_new_url():
            try:
                # Start crawling
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL and get the HTML back
                html_content = self.downloader.download(new_url)
                # Parse the HTML with XPath to get the data we need
                new_urls, new_data = self.parser.parse(html_content)
                # Add the links found on this entry page to the URL manager to be crawled later
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'Crawl failed: {new_url}')
        self.output.output_html()
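Most of these crawlers hand URL bookkeeping to a UrlManager with add_new_url, add_new_urls, has_new_url and get_new_url methods. Below is a minimal set-based sketch of that interface, inferred from how the examples call it rather than taken from any of the quoted projects.

class UrlManager:
    # Tracks URLs waiting to be crawled and URLs already crawled.
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        # Queue a URL unless it has been seen before.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Pop one URL to crawl and remember it as visited.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url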
Code Example #19
 def __init__(self):
     # Instantiate the other module classes
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     # Starting URL for the crawl
     self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
     # Base used to build subsequent URLs
     self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
     # Province page list
     self.province_url_list = []
     # City page list
     self.city_url_list = []
     # County/district page list
     self.county_url_list = []
     # Township and sub-district page list
     self.town_url_list = []
Code Example #20
File: spider_worker.py Project: yeseni-today/toys
 def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
     """初始化分布式进程中工作节点的连接工作"""
     # 注册用于获取Queue的方法名称
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # 连接到服务器
     print('Connect to server %s:%s...' % (address, port))
     self.manager = BaseManager(address=(address, port), authkey=authkey)
     # 开始连接
     self.manager.connect()
     # 获取Queue对象
     self.task_q = self.manager.get_task_queue()
     self.result_q = self.manager.get_result_queue()
     # 初始化下载器和解析器
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Code Example #21
File: main_spider.py Project: zunhai/spider_example
class Scheduler(object):
    def __init__(self):
        self.url = UrlManager()
        self.downloader = Downloader()
        self.parser = HtmlParser()
        self.output = OutPutUse()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36',
        }

    def run(self):
        url_seed_main = self.url.get_main_seed_url()
        content = self.downloader.download(url_seed_main,
                                           retry_count=2,
                                           headers=self.headers).decode('utf8')
        subject_urls = self.parser.parse_main_subjects(content)
        for subject_url in subject_urls:
            self._run_subject(subject_url)

    def _run_subject(self, subject_url):
        print('#subject_url#:' + subject_url)
        content = self.downloader.download(subject_url,
                                           retry_count=2,
                                           headers=self.headers).decode('utf8')
        mj_info = self.parser.parse_subject_mj_info(content)
        if mj_info is None:
            return
        mj_max_count = int(mj_info['count'])
        mj_name = str(mj_info['mj_name'])
        cur_count = 1
        index = 1
        while cur_count <= mj_max_count:
            real_url = subject_url
            if index > 1:
                real_url = subject_url[0:len(subject_url) -
                                       5] + ('_' + str(index) + '.html')
            index = index + 1
            # Normally each page has 4 large images
            cur_count = cur_count + 4
            print('Fetching large-image page: ' + real_url)
            content = self.downloader.download(
                real_url, retry_count=2, headers=self.headers).decode('utf8')
            pic_urls = self.parser.parse_page_pics(content)
            for pic_url in pic_urls:
                self.output.download_and_save(pic_url, mj_name)
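The Downloader used by this scheduler takes retry_count and headers arguments and returns raw bytes that the callers then .decode('utf8'). A rough sketch of such a download helper with simple retries, assuming requests, is shown below; it is illustrative only, not the project's implementation.

import time
import requests

def download(url, retry_count=2, headers=None):
    # Return the response body as bytes, retrying a few times on failure.
    for attempt in range(retry_count + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.content
        except requests.RequestException:
            pass
        time.sleep(1)  # brief pause before the next attempt
    return b''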
Code Example #22
 def main():
     sd = input("Start Date(yyyy,m,d): ")
     ed = input("End Date(yyyy,m,d): ")
     print(datetime.datetime.now())
     multiParsedTagList = hp.get_fullParsedTagList(sd, ed)
     tagSelect = sc.get_singlePageInfo(multiParsedTagList)
     pageInfos = sc.get_pageInfos(tagSelect)
     #pp.print_mergedList(pageInfos)
     pp.save_csv(pageInfos, sd, ed)
     print(datetime.datetime.now())
Code Example #23
    def __init__(self, max_tasks=10, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.parser = HtmlParser()
        self.saver = Saver()
        self.url_manager = UrlManager()

        self.max_tasks = max_tasks

        # Initialize the queue
        if not self.url_manager:
            self.url_manager.put("http://company.yellowurl.cn/")
Code Example #24
File: web_crawler.py Project: kitlawes/web-crawler
class WebCrawler:
    def __init__(self):
        self.url_util = UrlUtil()
        self.html_requester = HtmlRequester()
        self.html_parser = HtmlParser()

    def crawl(self, url):
        """
        Returns the URLs reachable from the parameter URL.
        The assets of each URL are also returned.
        Only URLs with the same hostname (including subdomain) as the parameter URL are returned.
        """

        url = self.url_util.normalise_url(url)
        hostname = self.url_util.get_hostname(url)

        urlsToVisit = [url]
        urlsVisted = []
        output = []
        # Each iteration of this loop processes the next URL to visit.
        while (len(urlsToVisit) > 0):

            url = urlsToVisit.pop(0)
            urlsVisted.append(url)

            html = self.html_requester.get_html(url)
            links = self.html_parser.get_links(html)
            same_hostname_urls = self.html_parser.get_same_hostname_urls(
                hostname, links)
            assets = self.html_parser.get_assets(same_hostname_urls)
            web_pages = self.html_parser.get_web_pages(same_hostname_urls)

            output.append({"url": url, "assets": assets})
            print json.dumps({"url": url, "assets": assets}, indent=4)

            for web_page in web_pages:
                # Do not visit a page more than once
                if not web_page in urlsToVisit and web_page not in urlsVisted:
                    urlsToVisit.append(web_page)

        return json.dumps(output, indent=4).splitlines()
Code Example #25
    def craw(self):
        # Downloader
        downloader = HtmlDownloader()

        root_cont = downloader.download(self.url)
        parser = HtmlParser()
        urls, data = parser.parse(self.url, root_cont, True)
        result = ""
        for url in urls:
            cont = downloader.download(url)
            newurls, month = parser.parse(url, cont, False)
            if month is not None:
                result += month.getMonthly()
            month = None
            #print(month.getMonthly())

        f = open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8')
        result = "## 阿里巴巴数据库内核月报\n\n" + result
        f.write(result)
        f.close()

Code Example #26
File: manager.py Project: yeseni-today/toys
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')

        self.output.output_end()
        print('Crawl finish')
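The replace('://', '%3A%2F%2F')[:-1] trick above only percent-encodes part of the movie URL. For comparison, urllib.parse.quote from the standard library produces the fully encoded form shown in the comment block; a small illustrative snippet (the sample URL is taken from that comment):

from urllib.parse import quote

movie_url = 'http://movie.mtime.com/246526/'
# Encode every reserved character, including ':' and '/'
encoded = quote(movie_url, safe='')
# encoded == 'http%3A%2F%2Fmovie.mtime.com%2F246526%2F'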
Code Example #27
File: spider_worker.py Project: yeseni-today/toys
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """初始化分布式进程中工作节点的连接工作"""
        # 注册用于获取Queue的方法名称
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 连接到服务器
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # 开始连接
        self.manager.connect()
        # 获取Queue对象
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # 初始化下载器和解析器
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):

        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()

                    if url == 'end':
                        print('Control node told the spider nodes to stop working...')
                        # Then tell the other nodes to stop as well
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return

                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})

                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
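The control-node side that these SpiderWorker instances connect to is not included in this listing. Below is a minimal sketch of what it might look like, reusing the queue names, port and authkey from the worker code above; everything else is an assumption, not the original project's code.

from multiprocessing.managers import BaseManager
from queue import Queue

# Hypothetical control node exposing a task queue and a result queue
# on port 8001 with authkey b'baike', matching the worker above.
task_queue = Queue()
result_queue = Queue()

def get_task_queue():
    return task_queue

def get_result_queue():
    return result_queue

if __name__ == '__main__':
    BaseManager.register('get_task_queue', callable=get_task_queue)
    BaseManager.register('get_result_queue', callable=get_result_queue)
    manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
    manager.start()

    # Hand out work, then tell the workers to stop.
    shared_task_q = manager.get_task_queue()
    shared_task_q.put('https://example.com/seed')  # placeholder seed URL
    shared_task_q.put('end')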
Code Example #28
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # HTML downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # HTML parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.downloader(new_url)

                # Report download status
                downStat = "ERROR"
                if html_cont is not None:
                    downStat = "SUCCESS"
                    print "[Page ID : %d downloader %s!]" % (count, downStat)

                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print "\nnew_urls[%s], new_data[%s]" % (new_urls, new_data)

                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print "craw failed! ERROR information : %s" % err
        self.outputer.output_html()
Code Example #29
File: wikisum.py Project: TPeterW/summariser
class Crawler(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        data = {}
        for word in keywords:
            url = self.crawl(word)
            data[word] = url

        return data

    def crawl(self, word):
        results = {}
        url = self.manager.get_url(word)
        page = self.downloader.download(url)

        return self.parser.search(page)
Code Example #31
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base used to build subsequent URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # Record the URL being downloaded and parsed, to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used for joining relative URLs
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            #exit()
            if (len(self.school_infos) != 20):
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            #print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writing")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" +
                            dh + "\t" + dz + "\n")
            return len(self.school_infos)

        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
Code Example #32
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base used to build subsequent URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # Adjust the log file path as needed
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"

    def craw(self, downloading_url):
        try:
            # Record the URL being downloaded and parsed, to make errors easier to trace
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if (len(self.school_infos) != 20):
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            for mc, xd, qy, xz, dh, dz in self.school_infos:
                # print(mc+xd+qy+xz+dh+dz)
                province_id = self.mysql_handler.insert(mc, xd, qy, xz, dh, dz)
                # print(province_id)
                # exit()
                # Record the URL being downloaded and parsed, to make errors easier to trace
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)
Code Example #33
File: spider_main.py Project: ofoxtigero/test
class SpiderMain(object):
    """docstring for SpiderMain"""
    def __init__(self):
        self.urlManage = UrlManage()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self,url):
        self.urlManage.add_new_url(url)
        
        count = 1
        while self.urlManage.has_new_url():
            url = self.urlManage.get_new_url()
            print '%dth page,address:%s' % (count,url)
            html_content = self.downloader.downloadPage(url)
            new_urls,new_data = self.parser.parse(html_content,url)
            self.urlManage.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)

            if count == 10:
                break

            count = count + 1 
        self.outputer.output_html()
Code Example #34
def parse_html(input_file_path):
    result_file_path = "".join([os.path.splitext(input_file_path)[0], '.content'])
    with open(input_file_path, 'r') as html_file:
        doc = HtmlParser(html_file.read(), "lxml")
        doc.write_to_file(result_file_path)
Code Example #35
File: spider_main.py Project: ofoxtigero/test
 def __init__(self):
     self.urlManage = UrlManage()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Code Example #36
File: wikisum.py Project: TPeterW/summariser
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()