def __init__(self):
     """Wire up the spider's collaborators: URL manager, logger, downloader, parser, and outputer."""
     self.urls = UrlManager()
     # Logger named "spider_main", writing under the "logs" directory.
     self.log = MyLog("spider_main", "logs")
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Beispiel #2
0
 def __init__(self):
     """
     Instantiate the collaborating components: URL manager,
     HTML downloader, HTML parser, and data storage.
     """
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Beispiel #3
0
 def __init__(self):
     """Create the crawler's components.

     The collaborators (UrlManager, HtmlDownloader, HtmlParser,
     HtmlOutputer) are resolved at module scope; this method only
     instantiates and stores them.
     """
     # URL manager
     self.urls = UrlManager()
     # Page downloader
     self.downloader = HtmlDownloader()
     # Page parser
     self.parser = HtmlParser()
     # Result outputer
     self.outputer = HtmlOutputer()
Beispiel #4
0
 def __init__(self, server_addr='127.0.0.1', port=8001, authkey=b'baike'):
     """Connect this worker node to the task master.

     Backward compatible with the previous no-argument form: the
     defaults reproduce the old hard-coded 127.0.0.1:8001 / b'baike'
     connection (same parameterization as the sibling worker class).

     Args:
         server_addr: host of the BaseManager server.
         port: TCP port the server listens on.
         authkey: shared authentication key for the manager protocol.
     """
     # Register the remote queue accessors by name only; the server
     # process supplies the actual callables.
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     print('Connect to server %s...' % server_addr)
     self.m = BaseManager(address=(server_addr, port), authkey=authkey)
     self.m.connect()
     # Proxies for the shared task/result queues.
     self.task = self.m.get_task_queue()
     print(self.task.qsize())
     self.result = self.m.get_result_queue()
     # Per-worker downloader and parser.
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Beispiel #5
0
 def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
     """Initialize the distributed worker node's connection to the task master."""
     # Register only the names of the queue accessors; the server
     # process provides the implementations.
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Connect to the server
     print('Connect to server %s:%s...' % (address, port))
     self.manager = BaseManager(address=(address, port), authkey=authkey)
     # Establish the connection
     self.manager.connect()
     # Obtain the shared Queue proxies
     self.task_q = self.manager.get_task_queue()
     self.result_q = self.manager.get_result_queue()
     # Instantiate the downloader and parser used by this worker
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Beispiel #6
0
 def __init__(self):
     """Set up crawler components and seed URLs for the stats.gov.cn
     2017 administrative-division-code pages."""
     # Instantiate the collaborating modules
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     # Crawl entry point (2017 division-code index page)
     self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
     # Base URL used to join relative links found on the pages
     self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
     # Collected page URLs, one list per administrative level:
     # province pages
     self.province_url_list = []
     # city pages
     self.city_url_list = []
     # county/district pages
     self.county_url_list = []
     # town / sub-district pages
     self.town_url_list = []
 def __init__(self):
     """Seed the URL pools for the baike.baidu.com crawl.

     Reads the end index from the file named by ArgumentParser's
     index_end_path, enqueues one /view/<i> URL per index in
     [1, index_end), and prepares the bookkeeping sets plus a
     downloader instance. Exits the process if the index file
     cannot be read.
     """
     args = ArgumentParser()
     index_start = 1
     try:
         with open(args.index_end_path, 'r', encoding='utf-8') as fp:
             index_end = int(fp.readline().strip('\n'))
     except Exception as err:
         print(err)
         sys.exit(-1)
     print("Adding all urls ...")
     # Pending URLs: one encyclopedia view page per index.
     self.new_urls = {
         "https://baike.baidu.com/view/" + str(i)
         for i in range(index_start, index_end)
     }
     print("Done.")
     # Already-crawled and failed URL bookkeeping.
     self.old_urls = set()
     self.fail_urls = set()
     self.fail_url_mark = True
     self.downloader = HtmlDownloader()
Beispiel #8
0
    def craw(self):
        """Crawl the monthly-report index and write a Markdown digest.

        Downloads self.url, parses it for the month-page URLs, renders
        each parsed month via getMonthly(), and writes the concatenated
        result to a Markdown file.

        Fixes over the original: the output file is managed with a
        'with' block (closed even if write() raises), string pieces are
        joined once instead of repeated '+=' concatenation, 'is not
        None' replaces '!= None', and the dead 'pass' / redundant
        'month = None' reset are removed.
        """
        downloader = HtmlDownloader()
        parser = HtmlParser()

        # Parse the index page for the per-month URLs.
        root_cont = downloader.download(self.url)
        urls, data = parser.parse(self.url, root_cont, True)

        # Collect each month's rendered Markdown.
        parts = []
        for url in urls:
            cont = downloader.download(url)
            newurls, month = parser.parse(url, cont, False)
            if month is not None:
                parts.append(month.getMonthly())

        result = "## 阿里巴巴数据库内核月报\n\n" + "".join(parts)
        with open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8') as f:
            f.write(result)
Beispiel #9
0
 def __init__(self):
     """Instantiate the crawler components: URL manager, downloader, parser, and outputer."""
     self.urls = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
 def __init__(self):
     """Instantiate the MySQL handler, HTML downloader, and HTML parser."""
     # Instantiate the collaborating modules
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
 def __init__(self):
     """Instantiate the URL manager, HTML downloader, HTML parser, and data storage."""
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Beispiel #12
0
 def __init__(self):
     """Instantiate the downloader/parser components and set the local output path."""
     # Instantiate the collaborating modules
     # (MySQL handler is currently disabled:)
     #self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     # Local directory where scraped school data is written.
     self.path = "/Users/spike/python_项目/get_cd_school/"
Beispiel #13
0
from html_downloader import HtmlDownloader
from html_paraser import HtmlParser
import pymysql
from date_provider import getAllDayPerYear
import time

# Shared MySQL connection/cursor for storing scraped company records.
# NOTE(review): credentials are hard-coded in source — move them to a
# config file or environment variables.
conn = pymysql.connect(host='192.168.64.135',
                       port=3306,
                       user='******',
                       passwd='123456',
                       db='comp')
cursor = conn.cursor()

if __name__ == '__main__':
    hd = HtmlDownloader()
    hp = HtmlParser()

    province = 'zhejiang'
    # Walk years from 2019 back to 1950 (range stop 1949 is exclusive).
    for year in range(2019, 1949, -1):
        print(year)
        # Every calendar date in the year, used as the registration-date
        # path segment of the listing URL.
        year_date_list = getAllDayPerYear(year)
        # print(year_date_list)
        for comregdate in year_date_list:
            print(comregdate)
            errcnt = 0
            pagecnt_tmp = 0
            # Page through the per-date company listings.
            # NOTE(review): the snippet is truncated here — the body of
            # this loop continues beyond what is visible in this file.
            for pagecnt in range(0, 1000):

                url = r'https://gongshang.mingluji.com/' + province + r'/riqi/' + comregdate + r'?page=' + str(
                    pagecnt)
Beispiel #14
0
 def __init__(self):
     """Instantiate the downloader, parser, and data-output components."""
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
# -*- coding: utf-8 -*-
# @Author: cyb

from html_downloader import HtmlDownloader

# Smoke test: fetch one Baidu Baike page and dump the raw HTML.
page_downloader = HtmlDownloader()
page_html = page_downloader.download(
    url='https://baike.baidu.com/item/Python/407313')
print(page_html)

# Manual check: the response is a fully static page that already
# contains all the data we want. resp.text occasionally mis-detects
# the encoding, so resp.content.decode() is used instead.