Ejemplo n.º 1
0
 def __init__(self):
     """Wire up the crawler's collaborators and an empty error log."""
     # Instantiate every object this spider delegates work to.
     self.errorList = []
     self.parser = html_parser.HtmlParser()
     self.output = html_output.HtmlOutPut()
     self.downloader = html_downloader.HtmlDownLoader()
     self.urls = url_manager.UrlManager()
Ejemplo n.º 2
0
 def __init__(self):
     """Set up crawler collaborators and pre-fetch the proxy IP pool."""
     self.session = Session()
     self.ip_pool = ip_pool.IPPool()
     # Pre-load the proxies once so later requests can draw from the pool.
     self.ip_pools = self.ip_pool.get_ip_pools(self.session)
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownLoader()
     self.parser = html_parser.HTMLParser()
Ejemplo n.º 3
0
def get_goals(top_url):
    """Return the celebrity names listed on a Baidu top-list ("风云榜") page.

    Parameters
    ----------
    top_url : str
        URL of the ranking page (e.g. ``http://top.baidu.com/buzz?b=3``).

    Returns
    -------
    list[str]
        Anchor texts of the ``list-title`` links; an empty list when the
        page could not be downloaded.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
    }
    downloader = html_downloader.HtmlDownLoader()
    try:
        html_content = downloader.download(top_url,
                                           retry_count=2,
                                           headers=headers)
    except Exception:
        # BUGFIX: the original bare `except:` printed and fell through, then
        # crashed with NameError on the unbound `html_content` below. Report
        # the failure and return an empty result instead.
        print('get top list failed')
        return []
    # Default to utf-8, then trust a charset declared in the page itself.
    html_encode = 'utf-8'
    encodem = re.search(r'.+?charset=(\w+)"',
                        html_content.decode(html_encode, errors='ignore'))
    if encodem is not None:
        html_encode = encodem.group(1)
    soup = BeautifulSoup(html_content.decode(html_encode, errors='ignore'),
                         "html.parser")
    # Only absolute http(s) hrefs on "list-title" anchors carry the names.
    links = soup.find_all("a", {'class': "list-title"},
                          href=re.compile('http.+?'))
    return [link.get_text() for link in links]
Ejemplo n.º 4
0
 def __init__(self, seed_url, user_agent='', managerQueue=None):
     """Build a FIFO-driven image crawler rooted at *seed_url*.

     The optional *managerQueue* backs the URL manager; *user_agent* is
     sent on every request via ``self.headers``.
     """
     self.managerQueue = managerQueue
     self.urls = url_manager.UrlManagerFIFO(managerQueue)
     self.headers = {"User-Agent": user_agent}
     # Honour the site's robots.txt for the seed host.
     self.rp = get_robots(seed_url)
     self.downloader = html_downloader.HtmlDownLoader()
     self.parser = html_parser.HtmlParser()
     self.out_put = jpg_output.jpgOutPut()
Ejemplo n.º 5
0
    def __init__(self):
        """Create the URL-manager, downloader and parser collaborators."""
        # NOTE(review): 'paser' spelling kept -- it matches the html_paser module.
        self.paser = html_paser.HtmlPaser()
        self.downloader = html_downloader.HtmlDownLoader()
        self.urls = url_manager.UrlManager()
Ejemplo n.º 6
0
 def __init__(self):
     """Assemble the crawl pipeline: URL queue -> downloader -> parser -> output."""
     self.out_put = html_output.HtmlOutput()
     self.parser = html_parser.HtmlParser()
     self.downloader = html_downloader.HtmlDownLoader()
     self.urls = url_manager.UrlManager()
Ejemplo n.º 7
0
 def __init__(self):
     """Assemble the crawl pipeline plus a MySQL connection for persistence."""
     self.conn_mysql = connectmysql.ConnectMysql()
     self.out_put = html_output.HtmlOutput()
     self.parser = html_parser.HtmlParser()
     self.downloader = html_downloader.HtmlDownLoader()
     self.urls = url_manager.UrlManager()
Ejemplo n.º 8
0
#coding=utf-8
# Force the process-wide default string encoding to utf-8.
# NOTE(review): Python 2 only -- reload() and sys.setdefaultencoding()
# do not exist in Python 3.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Record the filesystem encoding.
# NOTE(review): this rebinds and shadows the builtin 'type' at module level.
type = sys.getfilesystemencoding()

# Import the crawler's collaborator modules.
import url_manager, html_downloader, html_parser, html_outputer
# One shared, module-level instance of each component (used by SpiderMain).
urls = url_manager.UrlManager()
downloader = html_downloader.HtmlDownLoader()
parser = html_parser.htmlParser()
outputer = html_outputer.htmlOutputer()


class SpiderMain(object):
    """Crawl scheduler over the module-level components (Python 2 syntax)."""

    def __init__(self):  # bind the shared module-level singletons
        self.urls = urls
        self.downloader = downloader
        self.parser = parser
        self.outputer = outputer

    def craw(self, root_url):  # main crawl scheduling loop
        count = 1
        self.urls.add_new_url(root_url)  # seed the frontier with the root URL
        while self.urls.has_new_url():  # loop while unvisited URLs remain
            try:
                new_url = self.urls.get_new_url()  # pop the next URL to crawl
                print 'craw %d : %s' % (count, new_url)
                # NOTE(review): the snippet is truncated here -- this 'try'
                # has no matching 'except' within the visible excerpt.
Ejemplo n.º 9
0
 def __init__(self):
     """Initialise the four crawler components."""
     self.outputer = html_outputer.HtmlOutputer()  # result writer
     self.parse = html_parser.HtmlParser()  # page parser
     self.downloader = html_downloader.HtmlDownLoader()  # page fetcher
     # URL bookkeeping (pending / seen)
     self.urls = url_manager.UrlManager()
Ejemplo n.º 10
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer."""
     self.outputer = html_outputer.HtmlOutputer()  # result outputer
     self.parser = html_parser.HtmlParser()  # HTML parser
     self.downloader = html_downloader.HtmlDownLoader()  # page downloader
     self.urls = url_manager.UrlManager()  # URL manager
Ejemplo n.º 11
0
 def __init__(self):
     """Instantiate manager, downloader, parser and output components."""
     # NOTE(review): 'HtmpParser' / 'HtmlOutpter' spellings kept -- they must
     # match the class names actually exported by those project modules.
     self.output = html_outputer.HtmlOutpter()
     self.parser = html_parser.HtmpParser()
     self.downloader = html_downloader.HtmlDownLoader()
     self.urlManager = url_manager.UrlManager()
Ejemplo n.º 12
0
class SpiderMain (object):
    """Crawler façade wiring URL manager, downloader, parser and outputer."""

    def __init__(self):
        # BUGFIX: the original body mixed tabs and spaces on alternating
        # lines, which raises TabError under Python 3; indentation is
        # normalised to 4 spaces throughout.
        # NOTE(review): 'url_manmager' and 'HmtlOutputer' look like typos of
        # 'url_manager' / 'HtmlOutputer' -- confirm against the project
        # modules before renaming; kept as-is to preserve behaviour.
        self.urls = url_manmager.UrlManager()
        self.downloader = html_downloader.HtmlDownLoader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HmtlOutputer()
Ejemplo n.º 13
0
# coding:utf8