Beispiel #1
0
 def __init__(self, count, urls):
     """Initialise the thread base class and attach the helper objects.

     Args:
         count: stored unchanged on ``self.count``.
         urls: stored unchanged on ``self.urls``.
     """
     # The Thread base class is initialised before any state is attached.
     threading.Thread.__init__(self)
     self.count, self.urls = count, urls
     # One instance each of the downloader, parser and outputer helpers.
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Beispiel #2
0
    def __init__(self):
        """Initialise the base class, then create the four helper objects."""
        super(SpiderMain, self).__init__()

        # One instance each: URL manager, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
Beispiel #3
0
 def __init__(self):
     """Create the download, parse, URL-management and output helpers.

     In each call below the first name is the module and the second is
     the object defined inside that module.
     """
     self.html_download = html_download3.HtmlDownload()  # downloader
     self.html_parser = HtmlParse.HtmlParse()  # parser
     self.urlmanage = UrlManage.UrlManage()  # URL manager
     self.html_output = html_outputer.HtmlOutputer()  # outputer
Beispiel #4
0
    def __init__(self):
        """Load the start-up configuration, prepare the database, build helpers.

        Reads ``project_id``, ``root_url`` and ``number`` from the ``start``
        section of ``config.conf``, fetches the matching ``project`` and
        ``project_field`` rows, and derives ``self.tablename`` from the first
        project row.
        """
        # --- configuration -------------------------------------------------
        cf = ConfigParser.ConfigParser()
        cf.read("config.conf")
        self.projectid = '%s' % cf.get("start", "project_id")
        self.root_url = '%s' % cf.get("start", "root_url")
        self.number = '%s' % cf.get("start", "number")

        # --- database connection and project lookup ------------------------
        db = mysqldbhand()
        db.dbconnect()
        db.init_tables(self.projectid)
        # NOTE(review): the where clauses are built by string formatting from
        # the configured project id; confirm whether FindAll supports
        # parameterised conditions.
        project_where = 'id= %s' % (self.projectid)
        project = db.FindAll('project', '*', where=project_where)
        field_where = 'pid= %s' % (self.projectid)
        project_field = db.FindAll('project_field', '*', where=field_where)
        # Index 2 of the first project row supplies the table-name prefix.
        self.tablename = project[0][2] + '_content'

        # --- helper objects ------------------------------------------------
        # URL manager
        self.urls = url_manager.UrlManager(self.tablename)
        # Downloader
        self.downloader = html_downloader.HtmlDownloader()
        # Page parser
        self.parse = html_parser.HtmlParser(self.tablename, project,
                                            project_field)
        # Storage (outputer) component
        self.outputer = html_outputer.HtmlOutputer(self.tablename)
Beispiel #5
0
 def __init__(self):
     """Create the four helper objects and set the target keyword list."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
     # Alternative keyword list kept for reference:
     #   ['B-box', 'beatbox', 'bbox', 'Beatbox']
     self.targetKeywords = ['三峡']
Beispiel #6
0
 def __init__(self, xing, ming):
     """Create the helper objects and build the author-search parameters.

     Args:
         xing: stored on ``self.xing``; sent as the ``st1`` parameter.
         ming: stored on ``self.ming``; sent as the ``st2`` parameter.

     The resulting ``self.param`` dict is identical to the one produced
     before; the ``'_authSubject'`` key used to be repeated four times in
     the literal (only the last occurrence counts), and is now written once.
     """
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.xing = xing
     self.ming = ming
     # NOTE(review): the 's' query places `ming` in AUTH--LAST--NAME and
     # `xing` in AUTH--FIRST, the opposite order to st1/st2 -- confirm this
     # is intended before changing it.
     self.param = {
         'origin': 'searchauthorlookup',
         'src': 'al',
         'edit': '',
         'poppUp': '',
         'basicTab': '',
         'affiliationTab': '',
         'advancedTab': '',
         'st1': xing,
         'st2': ming,
         'institute': '',
         '_exactSearch': 'on',
         'orcidId': '',
         # Disabled 'authSubject' filters: LFSC, HLSC, PHSC, SOSC.
         '_authSubject': 'on',
         's': 'AUTH--LAST--NAME({0}) AND AUTH--FIRST({1})'.format(ming, xing),
         'sdt': 'al',
         'sot': 'al',
         # Disabled: 'searchId': sid, 'sid': sid
     }
 def __init__(self):
     """Create the helper objects, one instance of each."""
     self.url_manager = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.google_fetcher = html_google_fetcher.HtmlGoogleParser()  # Google parser
     self.pr_calculator = page_rank_util.PRCalculator()  # PR calculator
     self.url_parser = html_parser.HtmlParser()  # HTML parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
Beispiel #8
0
 def __init__(self, tp, seed):
     """Store the arguments, start an empty visited set, create the helpers.

     Args:
         tp: stored unchanged on ``self.tp``.
         seed: stored unchanged on ``self.seed``.
     """
     self.tp, self.seed = tp, seed
     self.visited = set()  # starts empty
     # Helpers are created in the order outputer, parser, downloader.
     self.html_outputer = html_outputer.HtmlOutputer()
     self.html_parser = html_parser.HtmlParser()
     self.html_downloader = html_downloader.HtmlDownloader()
 def __init__(self, path=""):
     """Create the helpers and record the root URL and the given path.

     Args:
         path: passed to the URL manager and stored on ``self.path``;
             defaults to the empty string.
     """
     self.urls = url_manager.UrlManager(path)
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.root_url = "http://baike.baidu.com/view/21087.htm"
     self.path = path
     # True exactly when ``path`` has length zero.
     self.new = len(path) == 0
Beispiel #10
0
 def __init__(self):
     """Create the page parser, the data outputer and the database helper.

     No downloader object is created in this constructor.
     """
     # HTML page parser.
     # NOTE(review): ``HtmlParsesr`` is spelled differently from the usual
     # ``HtmlParser`` -- confirm it matches the class name in html_parser.
     self.parser = html_parser.HtmlParsesr()
     # Outputer for the crawled data.
     self.outputer = html_outputer.HtmlOutputer()
     # Database-backed URL manager.
     self.database = database.Database()
Beispiel #11
0
 def __init__(self):
     """Initialise the object by creating one instance of each of the four modules."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
 def __init__(self):
     """Create the four helper objects and the list of download names."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
     self.download_list = [
         'tv',
         'korean_tv',
         'china_tv',
         'hongkong_tv',
         'cartoon',
         'jilu',
     ]
Beispiel #13
0
 def __init__(self):
     """Create the four helper objects and the list of download names.

     The downloader here is an ``html_request.HtmlRequests`` instance.
     """
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_request.HtmlRequests()  # request-based downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
     self.download_list = [
         'gq_movie',
         'hao6_mj',
         'china_tv',
         'hongkong_tv',
         'cartoon',
         'jilu',
     ]
Beispiel #14
0
def main():
    """Render ``main.html`` with the default keyword/location/rule/school values."""
    # Default values, used both for the lookup and for the template context.
    keyword, location, rule, school = '', '全国', '不限', '不限'
    listing = html_outputer.HtmlOutputer().output_html(
        keyword, location, rule, school)
    # The template receives the lookup result under the name ``list``.
    return render_template('main.html',
                           list=listing,
                           keyword=keyword,
                           location=location,
                           rule=rule,
                           school=school)
Beispiel #15
0
 def __init__(self, isuse, connection):
     """Store the configuration and connection, then create the helpers.

     Args:
         isuse: forwarded to the URL manager as its second argument.
         connection: stored on ``self.connection`` and forwarded to the
             URL manager as its first argument.
     """
     # NOTE(review): ``config`` is not a parameter; it is resolved from the
     # enclosing scope -- confirm which object this refers to.
     self.config = config
     self.connection = connection
     self.urls = url_manager.UrlManager(connection, isuse)  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # page downloader
     self.parser = html_parser.HtmlParse()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
     self.imgdownloader = img_downloader.ImgDownloader()  # image downloader
Beispiel #16
0
    def __init__(self):
        """Create the objects that craw() uses.

        These are the URL manager, the page downloader, the page parser
        and the outputer.
        """
        self.urls = url_manager.UrlManager()  # URL manager
        self.downloader = html_downloader.HtmlDownloader()  # page downloader
        self.parser = html_parser.HtmlParser()  # page parser
        self.outputer = html_outputer.HtmlOutputer()  # outputer
Beispiel #17
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer.

     The notes on each helper below are carried over from the original
     author's comments.
     """
     # URL manager: keeps two sets, one for URLs already crawled and one for
     # URLs still to crawl. It offers four methods: has_new_url,
     # get_new_url, add_new_url and add_new_urls.
     self.urls = url_manager.UrlManager()
     # Downloader: offers one method, download(url), which returns a string
     # for the given URL.
     self.downloader = html_downloader.HtmlDownloader()
     # HTML page parser: offers one method, parse(new_url, html_cont), which
     # returns the URLs and data parsed from the page.
     self.parser = html_parser.HtmlParser()
     # Outputer.
     self.outputer = html_outputer.HtmlOutputer()
Beispiel #18
0
 def __init__(self):
     """Create the manager, downloader, parser and outputer helpers."""
     self.urls = url_manager.UrlManager()  # acts as the URL manager
     self.downloader = html_downloader.HtmlDownloader()  # acts as the downloader
     self.parser = html_parser.HtmlParser()  # acts as the parser
     # The outputer writes the processed data out to an HTML page.
     self.outputer = html_outputer.HtmlOutputer()
Beispiel #19
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParse()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
Beispiel #20
0
 def __init__(self):
     """Initialise the base class and create the helper objects.

     The PDF data dealer is constructed last because it receives the
     local data manager created just before it.
     """
     super(SpiderMain, self).__init__()
     # URL manager, downloader, parser and outputer.
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # Data uploader and local data manager.
     self.uploader = data_uploader.DataUploader()
     self.localDataManager = local_data_manager.LocalDataManager()
     # The PDF data dealer is given the local data manager instance.
     self.pdfDealer = fang_main.PDF_Data_Dealer(self.localDataManager)
Beispiel #21
0
 def __init__(self):
     """Create the URL manager and the HTML downloader, parser and outputer."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # HTML downloader
     self.parser = html_parser.HtmlParser()  # HTML parser
     self.outputer = html_outputer.HtmlOutputer()  # HTML outputer
Beispiel #22
0
 def __init__(self):
     """Create the URL manager, downloader, parser and output helper."""
     self.urls = url_manager.UrlManager()  # manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # output device
Beispiel #23
0
    def __init__(self):
        """Initialise each module: URL manager, downloader, parser, outputer."""
        self.urls = url_manager.UrlManager()  # URL manager object
        self.downloader = html_downloader.HtmlDownloader()  # HTML downloader
        self.parser = html_parser.HtmlParser()  # HTML parser
        self.outputer = html_outputer.HtmlOutputer()  # HTML outputer
Beispiel #24
0
    def __init__(self):
        """Create the helper objects and load the link set from disk.

        Each line of ``./link_set.txt``, with newline characters stripped
        from its ends, becomes one entry of ``self.links``.

        Raises:
            IOError: (``OSError`` on Python 3) if ``./link_set.txt`` cannot
                be opened.
        """
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.links = set()

        # The context manager closes the file handle even if reading fails;
        # previously the handle was never closed.
        with open('./link_set.txt', 'r') as link_file:
            for line in link_file:
                self.links.add(line.strip('\n'))
Beispiel #25
0
 def __init__(self):
     """Create the URL scheduler, downloader, parser, outputer and saver."""
     self.urls = url_manager.UrlManager()  # URL scheduler
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.Parser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # content outputer
     self.saver = db_saver.DBSaver()  # content storage
Beispiel #26
0
 def __init__(self):
     """Create the crawler helpers plus the word-cloud generator."""
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # page downloader
     self.parser = html_parser.HtmlParser()  # data extractor
     self.outputer = html_outputer.HtmlOutputer()  # data processor
     self.cloud = word_cloud.Wordcloud()  # word-cloud generator
Beispiel #27
0
 def __init__(self, xml_file_name):
     """Create the four crawler helpers and the XML reader.

     Args:
         xml_file_name: passed to ``xml_reader.xmlReader``.
     """
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # downloader
     self.parser = html_parser.HtmlParser()  # parser
     self.outputer = html_outputer.HtmlOutputer()  # outputer
     self.reader = xml_reader.xmlReader(xml_file_name)  # XML parser
Beispiel #28
0
    def __init__(self):
        """Constructor: create and register the helper objects.

        These include the URL manager, the page downloader and the page
        parser; an outputer is created as well.
        """
        self.urls = url_manager.UrlManager()  # URL manager
        self.downloader = html_downloader.HtmlDownloader()  # page downloader
        self.parser = html_parser.HtmlParser()  # page parser
        self.outputer = html_outputer.HtmlOutputer()  # outputer
Beispiel #29
0
def signin():
    """Render ``main.html`` using the values submitted in the request form."""
    outputer = html_outputer.HtmlOutputer()
    # The form contents are read from the request object, once per field.
    form = request.form
    keyword = form['keyword']
    location = form['location']
    rule = form['rule']
    school = form['school']
    listing = outputer.output_html(keyword, location, rule, school)
    # The template receives the lookup result under the name ``list``.
    return render_template('main.html',
                           list=listing,
                           keyword=keyword,
                           location=location,
                           rule=rule,
                           school=school)
Beispiel #30
0
    def __init__(self, parent, driver, entry, num, status_text):
        """Store the constructor arguments and create the helper objects.

        Args:
            parent: stored unchanged on ``self.parent``.
            driver: stored unchanged on ``self.driver``.
            entry: stored unchanged on ``self.entry``.
            num: stored unchanged on ``self.num``.
            status_text: stored unchanged on ``self.status_text``.
        """
        # Arguments are kept as-is on the instance.
        self.parent = parent
        self.entry = entry
        self.num = num
        self.driver = driver
        self.status_text = status_text

        # URL bookkeeping: a URL manager plus an initially empty set.
        self.urls = url_manager.UrlManager()
        self.fielt_urls = set()

        # Helpers are created in the order downloader, outputer, parser.
        self.downloader = html_downloader.HtmlDownloader()
        self.outputer = html_outputer.HtmlOutputer()
        self.parser = html_parser.HtmlParser()