def __init__(self, count, urls):
    """Worker thread: remember its URL batch and wire up the crawler parts."""
    threading.Thread.__init__(self)
    self.count = count
    self.urls = urls
    # One instance of each pipeline stage per worker.
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Initialize the base class, then the four crawler components."""
    super(SpiderMain, self).__init__()
    # Pipeline stages: manage URLs -> download -> parse -> output.
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Wire up the crawler components.

    Naming convention: the first part is the module name, the second the
    class defined inside that module.
    """
    self.urlmanage = UrlManage.UrlManage()
    self.html_download = html_download3.HtmlDownload()
    self.html_parser = HtmlParse.HtmlParse()
    self.html_output = html_outputer.HtmlOutputer()
def __init__(self):
    """Load crawl settings from config.conf, open the DB, and wire up components."""
    # Read start-up settings.
    cf = ConfigParser.ConfigParser()
    cf.read("config.conf")
    # ConfigParser.get() already returns a string, so the original
    # "'%s' % cf.get(...)" wrapping was redundant and has been dropped.
    self.projectid = cf.get("start", "project_id")
    self.root_url = cf.get("start", "root_url")
    self.number = cf.get("start", "number")
    # Open the database connection and make sure the project tables exist.
    db = mysqldbhand()
    db.dbconnect()
    db.init_tables(self.projectid)
    # NOTE(review): these WHERE clauses are built by string interpolation.
    # That is tolerable for a trusted config value, but switch to
    # parameterized queries if projectid can ever come from untrusted input.
    project = db.FindAll('project', '*', where='id= %s' % (self.projectid))
    project_field = db.FindAll('project_field', '*', where='pid= %s' % (self.projectid))
    # Column index 2 of the project row is used as the content-table prefix
    # (assumed from usage here — TODO confirm against the schema).
    self.tablename = project[0][2] + '_content'
    # URL manager
    self.urls = url_manager.UrlManager(self.tablename)
    # Page downloader
    self.downloader = html_downloader.HtmlDownloader()
    # Page parser
    self.parse = html_parser.HtmlParser(self.tablename, project, project_field)
    # DB writer
    self.outputer = html_outputer.HtmlOutputer(self.tablename)
def __init__(self):
    """Set up the crawler pipeline and the keyword filter."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    # Only pages matching these keywords are kept.
    self.targetKeywords = ['三峡']
def __init__(self, xing, ming):
    """Build the author-lookup search request for surname *xing*, given name *ming*."""
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.xing = xing
    self.ming = ming
    # Form parameters for the author search.  The original literal repeated
    # the key '_authSubject': 'on' four times (once per commented-out
    # 'authSubject' area: LFSC/HLSC/PHSC/SOSC); duplicate dict keys collapse
    # to a single entry at runtime, so only one is kept here.
    self.param = {
        'origin': 'searchauthorlookup',
        'src': 'al',
        'edit': '',
        'poppUp': '',
        'basicTab': '',
        'affiliationTab': '',
        'advancedTab': '',
        'st1': xing,
        'st2': ming,
        'institute': '',
        '_exactSearch': 'on',
        'orcidId': '',
        '_authSubject': 'on',
        's': 'AUTH--LAST--NAME({0}) AND AUTH--FIRST({1})'.format(ming, xing),
        'sdt': 'al',
        'sot': 'al',
    }
def __init__(self):
    """Assemble the crawl pipeline, including Google fetcher and PageRank calculator."""
    self.url_manager = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.url_parser = html_parser.HtmlParser()
    self.pr_calculator = page_rank_util.PRCalculator()
    self.google_fetcher = html_google_fetcher.HtmlGoogleParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self, tp, seed):
    """Remember the crawl type and seed URL; start with an empty visited set."""
    self.tp = tp
    self.seed = seed
    self.visited = set()
    # Pipeline stages.
    self.html_downloader = html_downloader.HtmlDownloader()
    self.html_parser = html_parser.HtmlParser()
    self.html_outputer = html_outputer.HtmlOutputer()
def __init__(self, path=""):
    """Set up the crawler; *path* optionally resumes a previous crawl.

    An empty *path* means a fresh crawl starting from the root URL.
    """
    self.urls = url_manager.UrlManager(path)
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.root_url = "http://baike.baidu.com/view/21087.htm"
    self.path = path
    # Idiomatic truthiness test replaces "True if len(path)==0 else False".
    self.new = not path
def __init__(self):
    """Set up the page parser, output writer and database-backed URL store."""
    # Page parser.
    # NOTE(review): class name "HtmlParsesr" looks like a typo for
    # "HtmlParser" — confirm against the html_parser module before renaming.
    self.parser = html_parser.HtmlParsesr()
    # Writer for crawled data.
    self.outputer = html_outputer.HtmlOutputer()
    # Database-backed URL manager.
    self.database = database.Database()
def __init__(self):
    """Initialize: create instances of the four crawler modules."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Set up the crawl pipeline and the list of categories to download."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    # Site categories this spider fetches.
    self.download_list = [
        'tv',
        'korean_tv',
        'china_tv',
        'hongkong_tv',
        'cartoon',
        'jilu',
    ]
def __init__(self):
    """Set up the crawl pipeline (requests-based downloader) and category list."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_request.HtmlRequests()
    # Site categories this spider fetches.
    self.download_list = [
        'gq_movie',
        'hao6_mj',
        'china_tv',
        'hongkong_tv',
        'cartoon',
        'jilu',
    ]
def main():
    """Render the main page with the default (unfiltered) result list."""
    # Renamed the local from "list" to avoid shadowing the builtin.
    results = html_outputer.HtmlOutputer().output_html('', '全国', '不限', '不限')
    return render_template('main.html', list=results,
                           keyword='', location='全国', rule='不限', school='不限')
def __init__(self, isuse, connection):
    """Wire up the crawler: URL store backed by *connection*, plus downloaders."""
    self.config = config
    self.connection = connection
    self.urls = url_manager.UrlManager(connection, isuse)
    # Page and image fetchers plus the parse/output stages.
    self.imgdownloader = img_downloader.ImgDownloader()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParse()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Create the objects craw() needs: URL manager, downloader, parser, outputer."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Assemble the four crawler components.

    - URL manager: keeps two sets (crawled / pending); exposes
      has_new_url, get_new_url, add_new_url, add_new_urls.
    - Downloader: download(url) -> page content string.
    - Parser: parse(new_url, html_cont) -> (new urls, data).
    - Outputer: collects and writes the results.
    """
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Create the crawler components."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Writes the processed data out as an HTML page.
    self.outputer = html_outputer.HtmlOutputer()
    # Page parser.
    self.parser = html_parser.HtmlParser()
    # Page downloader.
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Create URL manager, downloader, parser and outputer instances."""
    self.urls = url_manager.UrlManager()        # URL manager
    self.outputer = html_outputer.HtmlOutputer()  # outputer
    self.parser = html_parser.HtmlParse()       # parser
    self.downloader = html_downloader.HtmlDownloader()  # downloader
def __init__(self):
    """Initialize the base class and every pipeline stage, including PDF handling."""
    super(SpiderMain, self).__init__()
    # Core crawl pipeline.
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    # Post-processing: upload results and handle local PDF data.
    self.uploader = data_uploader.DataUploader()
    self.localDataManager = local_data_manager.LocalDataManager()
    self.pdfDealer = fang_main.PDF_Data_Dealer(self.localDataManager)
def __init__(self):
    """Create the four crawler components."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # HTML outputer.
    self.outputer = html_outputer.HtmlOutputer()
    # HTML parser.
    self.parser = html_parser.HtmlParser()
    # HTML downloader.
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Instantiate URL manager, downloader, parser and output device."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Initialize each module: URL manager, downloader, parser, outputer."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def __init__(self):
    """Set up the crawl pipeline and preload known links from ./link_set.txt."""
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.links = set()
    # The original opened the file without ever closing it; `with`
    # guarantees the handle is released even if reading fails.
    with open('./link_set.txt', 'r') as link_file:
        for line in link_file:
            self.links.add(line.strip('\n'))
def __init__(self):
    """Create the crawl pipeline plus a database saver."""
    # URL scheduler.
    self.urls = url_manager.UrlManager()
    # Content outputer.
    self.outputer = html_outputer.HtmlOutputer()
    # Page parser.
    self.parser = html_parser.Parser()
    # Page downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Persists content to the database.
    self.saver = db_saver.DBSaver()
def __init__(self):
    """Create the crawl pipeline plus a word-cloud generator."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Data processor / outputer.
    self.outputer = html_outputer.HtmlOutputer()
    # Data extractor.
    self.parser = html_parser.HtmlParser()
    # Page downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Word-cloud generator.
    self.cloud = word_cloud.Wordcloud()
def __init__(self, xml_file_name):
    """Create the crawl pipeline and an XML reader for *xml_file_name*."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # URL outputer.
    self.outputer = html_outputer.HtmlOutputer()
    # URL parser.
    self.parser = html_parser.HtmlParser()
    # URL downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # XML reader for the given input file.
    self.reader = xml_reader.xmlReader(xml_file_name)
def __init__(self):
    """Constructor: register the URL manager, page downloader and page parser."""
    self.urls = url_manager.UrlManager()
    self.outputer = html_outputer.HtmlOutputer()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
def signin():
    """Render the main page filtered by the submitted search form."""
    # Read the submitted form fields once.
    form = request.form
    # Renamed the local from "list" to avoid shadowing the builtin.
    results = html_outputer.HtmlOutputer().output_html(
        form['keyword'], form['location'], form['rule'], form['school'])
    return render_template('main.html', list=results,
                           keyword=form['keyword'], location=form['location'],
                           rule=form['rule'], school=form['school'])
def __init__(self, parent, driver, entry, num, status_text):
    """Store the GUI/driver handles and build the crawl pipeline."""
    # Handles supplied by the caller (GUI widgets and webdriver).
    self.parent = parent
    self.entry = entry
    self.num = num
    self.driver = driver
    self.status_text = status_text
    # Crawl state and pipeline stages.
    self.urls = url_manager.UrlManager()
    self.fielt_urls = set()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.downloader = html_downloader.HtmlDownloader()