Example #1
0
 def __init__(self):
     """Initialize the base class and create the crawler's collaborators."""
     super(SpiderMain, self).__init__()
     # Endpoint URL that is handed to TsData below.
     self.ts_url = 'http://127.0.0.1:8000/api/lol/save_news'
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     # presumably TsData sends crawled data to ts_url — TODO confirm
     self.ts = ts_data.TsData(self.ts_url)
 def __init__(self):
     """Create the helper objects this crawler stores on the instance.

     Six collaborators are instantiated with no arguments: a URL manager,
     a downloader, a Google fetcher, a PR calculator, a URL/HTML parser
     and an outputer.
     """
     self.url_manager = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.google_fetcher = html_google_fetcher.HtmlGoogleParser()
     # presumably computes PageRank values — TODO confirm against page_rank_util
     self.pr_calculator = page_rank_util.PRCalculator()
     self.url_parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Example #3
0
    def __init__(self):
        """Initialize the base class, then create the four crawler components."""
        super(SpiderMain, self).__init__()

        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
Example #4
0
 def __init__(self):
     """Instantiate the objects this class depends on and an empty error list."""
     # Instantiate the objects that need to be referenced
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownLoader()
     self.output = html_output.HtmlOutPut()
     self.parser = html_parser.HtmlParser()
     # Starts empty; presumably collects failures during crawling — TODO confirm
     self.errorList = []
Example #5
0
 def __init__(self):
     """Set the crawl limit and create the crawler's helper objects."""
     self.maxcount = 1000  # maximum number of items to crawl
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.pdf = pdf_download.PdfDownload()
     self.pic = download.PicDowload()
Example #6
0
 def __init__(self, path=""):
     """Create the crawler components and record the given path.

     Args:
         path: Value forwarded to ``UrlManager`` and stored on the
             instance. Defaults to the empty string.
     """
     self.urls = url_manager.UrlManager(path)
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.root_url = "http://baike.baidu.com/view/21087.htm"
     self.path = path
     # True exactly when an empty path was supplied; the comparison already
     # yields a bool, so no conditional expression is needed.
     self.new = len(path) == 0
Example #7
0
 def __init__(self):
     """Create the four crawler components and set the target keyword list."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # Alternative keyword set, currently disabled:
     # self.targetKeywords = ['B-box', 'beatbox', 'bbox', 'Beatbox']
     self.targetKeywords = ['三峡']
Example #8
0
 def __init__(self):
     """Create the crawler components, a session, and load the IP pools."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownLoader()
     self.parser = html_parser.HTMLParser()
     self.ip_pool = ip_pool.IPPool()
     self.session = Session()
     # The session is passed to the pool; presumably used to fetch or
     # validate proxy addresses — TODO confirm against ip_pool.IPPool
     self.ip_pools = self.ip_pool.get_ip_pools(self.session)
 def __init__(self):
     """Create the URL manager, page downloader and page parser."""
     # URL manager
     self.urls = url_manager.UrlManager()
     # Web page downloader
     # NOTE(review): class name is spelled 'HtmlDowmloader' — must match html_downloader
     self.downloader = html_downloader.HtmlDowmloader()
     # Web page parser
     self.parser = html_parser.HtmlParser()
Example #10
0
    def __init__(self):
        """Read the crawl configuration, prepare the database and build the components.

        Reads ``project_id``, ``root_url`` and ``number`` from the ``start``
        section of ``config.conf``, initializes the project's tables, looks up
        the project row and its field definitions, and then creates the URL
        manager, downloader, parser and outputer for the derived table name.
        """
        # Initialize configuration
        cf = ConfigParser.ConfigParser()
        cf.read("config.conf")
        # Each setting is stored as a string via '%s' formatting.
        self.projectid = '%s' % cf.get("start", "project_id")
        self.root_url = '%s' % cf.get("start", "root_url")
        self.number = '%s' % cf.get("start", "number")

        # Open the database connection and set up the tables
        db = mysqldbhand()
        db.dbconnect()
        db.init_tables(self.projectid)
        # NOTE(review): the WHERE clauses are built by string interpolation from the
        # config value; confirm project_id is trusted or switch to a parameterized query.
        project = db.FindAll('project', '*', where='id= %s' % (self.projectid))
        project_field = db.FindAll('project_field',
                                   '*',
                                   where='pid= %s' % (self.projectid))
        # Element [2] of the first project row is used as the table-name prefix.
        self.tablename = project[0][2] + '_content'
        # Load the URL manager
        self.urls = url_manager.UrlManager(self.tablename)
        # Load the downloader
        self.downloader = html_downloader.HtmlDownloader()
        # Load the page parser (note: stored as self.parse, not self.parser)
        self.parse = html_parser.HtmlParser(self.tablename, project,
                                            project_field)
        # Load the component that stores results in the database
        self.outputer = html_outputer.HtmlOutputer(self.tablename)
Example #11
0
 def __init__(self):
     """Initialize the crawler.

     Creates one instance of each of the four modules: URL manager,
     downloader, parser and outputer.
     """
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
 def __init__(self):
     """Create the four crawler components and the list of download categories."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # presumably category keys for the content to download — TODO confirm usage
     self.download_list = [
         'tv', 'korean_tv', 'china_tv', 'hongkong_tv', 'cartoon', 'jilu'
     ]
Example #13
0
 def __init__(self):
     """Create the four crawler components and the list of download categories."""
     self.urls = url_manager.UrlManager()
     # The downloader here comes from html_request rather than html_downloader.
     self.downloader = html_request.HtmlRequests()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # presumably category keys for the content to download — TODO confirm usage
     self.download_list = [
         'gq_movie', 'hao6_mj', 'china_tv', 'hongkong_tv', 'cartoon', 'jilu'
     ]
Example #14
0
 def __init__(self):
     """Create the crawler components plus the threading state.

     Besides the URL manager, parser, downloader and data collector, this
     sets up a lock, a thread-local object and a page counter starting at 0.
     """
     self.urlManager = url_manager.UrlManager()
     self.parser = html_parser.HtmlParser()
     self.downloader = html_downloader.HtmlDownloader()
     self.collector = data_collector.DataCollector()
     self.lock = threading.Lock()  # thread lock
     self.local_crawed = threading.local()  # thread-local object so that each thread has its own data
     self.count = 0  # global count of crawled pages
Example #15
0
 def __init__(self):
     """Create the crawler components and the default request headers."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.output = html_output.HtmlOutput()
     # The HTTP header field is spelled "User-Agent" (hyphen); with an
     # underscore it would be sent as an unrelated, unrecognized header.
     self.headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36",
     }
Example #16
0
    def __init__(self, root_url):
        """Store the root URL and create the four crawler components.

        Args:
            root_url: Value stored on the instance as ``self.url``.
        """

        self.url = root_url

        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()
Example #17
0
    def __init__(self):
        """Create the URL manager, downloader, parser and outputer."""
        # Initialize the required objects (URL manager, page downloader,
        # page parser and outputer) so that craw() can use them.

        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
Example #18
0
 def __init__(self, isuse, connection):
     """Store the connection and create the crawler components.

     Args:
         isuse: Forwarded to ``UrlManager`` as its second argument.
         connection: Stored on the instance and forwarded to ``UrlManager``
             as its first argument.
     """
     # NOTE(review): `config` is not a parameter; it must resolve to a
     # module-level name (e.g. an imported config module) — confirm.
     self.config = config
     self.connection = connection
     self.urls = url_manager.UrlManager(connection, isuse)
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParse()
     self.outputer = html_outputer.HtmlOutputer()
     self.imgdownloader = img_downloader.ImgDownloader()
Example #19
0
    def __init__(self):
        """Create the URL manager, downloader, parser and directory helper."""

        self.urls = url_manager.UrlManager()

        self.downloader = html_downloader.HtmlDownload()

        self.parser = html_parser.HtmlParser()

        # presumably creates output directories — TODO confirm against test_mkdir
        self.mkdirs = test_mkdir.MakeDir()
Example #20
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     # URL manager
     self.urls = url_manager.UrlManager()
     # Downloader
     self.downloader = html_downloader.HtmlDownloader()
     # Parser
     self.parser = html_parser.HtmlParse()
     # Outputer
     self.outputer = html_outputer.HtmlOutputer()
Example #21
0
 def __init__(self):
     """Create the URL manager, downloader, parser and file outputer."""
     # URL manager
     self.urls = url_manager.UrlManager()
     # Web page downloader
     # NOTE(review): class name is spelled 'HtmlDowmloader' — must match html_downloader
     self.downloader = html_downloader.HtmlDowmloader()
     # Web page parser
     self.parser = html_parser.HtmlParser()
     # File read/write utility
     self.outputer = file_outputer.Outputer()
Example #22
0
 def __init__(self):
     """Create the URL manager, HTML downloader, HTML parser and HTML outputer."""
     # URL manager
     self.urls = url_manager.UrlManager()
     # HTML downloader
     self.downloader = html_downloader.HtmlDownloader()
     # HTML parser
     self.parser = html_parser.HtmlParser()
     # HTML outputer
     self.outputer = html_outputer.HtmlOutputer()
 def __init__(self):
     """Create the crawler's manager, outputer, parser and downloader."""
     # Initialize the crawler's URL manager
     self.manager = url_manager.UrlManager()
     # Initialize the outputer
     self.outputer = html_outputer.Outputer()
     # Initialize the parser
     self.parser = html_parser.Parser()
     # Initialize the downloader
     self.downloader = html_downloader.Downloader()
Example #24
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     # urls acts as the URL manager
     self.urls = url_manager.UrlManager()
     # downloader acts as the page downloader
     self.downloader = html_downloader.HtmlDownloader()
     # parser acts as the page parser
     self.parser = html_parser.HtmlParser()
     # outputer writes the processed data out to an HTML page
     self.outputer = html_outputer.HtmlOutputer()
Example #25
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     self.urls = url_manager.UrlManager(
     )  # URL manager: manages two sets, one of crawled URLs and one of URLs still to crawl
     # Provides four methods: has_new_url, get_new_url, add_new_url, add_new_urls
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     # Provides one method, download(url): returns a string for the given URL
     self.parser = html_parser.HtmlParser()  # HTML page parser
     # Provides one method, parse(new_url, html_cont): returns the URLs and data parsed from the page
     self.outputer = html_outputer.HtmlOutputer()  # outputer
Example #26
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     # URL manager
     self.urls = url_manager.UrlManager()
     # HTML downloader
     self.downloader = html_downloader.HtmlDownloader()
     # HTML parser
     self.parser = html_parser.HtmlParser()
     # HTML outputer
     self.outputer = html_outputer.HtmlOutputer()
Example #27
0
 def __init__(self):
     """Create the URL manager, downloader, parser and data outputer."""
     # Get the URL manager
     self.urls = url_manager.UrlManager()
     # Get the web page downloader
     self.downloader = html_downloader.HtmlDownloader()
     # Get the web page parser
     self.parser = html_parser.HtmlParser()
     # Get the data outputer
     self.output = html_outputer.HtmlOutput()
Example #28
0
 def __init__(self):
     """Initialize the base class and create the crawler's collaborators.

     In addition to the four crawler components, this creates a data
     uploader, a local data manager, and a PDF data dealer that is given
     the local data manager.
     """
     super(SpiderMain, self).__init__()
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.uploader = data_uploader.DataUploader()
     self.localDataManager = local_data_manager.LocalDataManager()
     # Must come after localDataManager is created, since it is passed in here.
     self.pdfDealer = fang_main.PDF_Data_Dealer(self.localDataManager)
Example #29
0
 def __init__(self):
     """Create the crawler components and the analysis/reporting helpers."""
     self.urls = url_manager.UrlManager()  # URL manager
     # NOTE(review): attribute is spelled 'downloder'; other code must use the same spelling
     self.downloder = html_downloader.HtmlDownloader()  # web page downloader
     self.parser = html_parser.HtmlParser()  # 51JOB page parser
     self.dataanalyse = data_analyse.DataAnalyse()  # data analysis
     self.datapicture = data_picture.DataPicture()  # data visualization
     self.datapicturepie = data_picture_pie.ShowJodSalary()  # data visualization (pie chart)
     self.yy = yuyin.YuYin()  # voice announcement
     # Disabled: clears the cached voice data
     #self.delect = del_huancun.DelHuancun()
     self.user_agent = user_agent.Random_user_agent()  # random request header
Example #30
0
    def __init__(self):  # initialize each module
        """Create the URL manager, HTML downloader, HTML parser and HTML outputer."""

        # URL manager object
        self.urls = url_manager.UrlManager()
        # HTML downloader
        self.downloader = html_downloader.HtmlDownloader()
        # HTML parser
        self.parser = html_parser.HtmlParser()
        # HTML outputer
        self.outputer = html_outputer.HtmlOutputer()