Example 1
class SpiderMan:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the request URL that fetches the rating and box-office data for each movie
        for url in urls:
            try:
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Service'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_end()
        print('Crawl finish')
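
Example 1 shows only the controller; the HtmlDownloader, HtmlParser and DataOutput classes it calls are not part of the snippet. A minimal sketch of the interfaces the crawl loop assumes (the method names are taken from the calls above, the bodies are illustrative assumptions, not the original implementation):

import json
import requests  # assumed HTTP client


class HtmlDownloader:
    def download(self, url):
        # Fetch the page and return its text, or None on failure.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return None


class HtmlParser:
    def parser_url(self, page_url, content):
        # Would return (movie_url, movie_id) pairs scraped from the listing page.
        raise NotImplementedError

    def parser_json(self, page_url, content):
        # Would strip the JSONP wrapper and return the rating/box-office fields.
        raise NotImplementedError


class DataOutput:
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_end(self):
        # Flush everything collected so far, e.g. to a JSON file.
        with open('output.json', 'w', encoding='utf-8') as f:
            json.dump(self.datas, f, ensure_ascii=False, indent=2)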
Example 2
class SpiderMan(object):
    def __init__(self):
        self.download = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        url_host = 'https://movie.douban.com{}'
        i = 2
        while True:
            content = self.download.download(root_url)
            html = etree.HTML(content)
            max_link = int(''.join(
                html.xpath(".//span[@data-total-page]/@data-total-page")))
            next_link = ".//div[@class='paginator']/a[{}]/@href".format(i)
            next_links = ''.join(html.xpath(next_link))
            details = html.xpath(".//div[@class='main-bd']/h2/a/@href")
            for detail in details:
                text = self.download.download(detail)
                datas = self.parser.parser(text)
                self.output.store_data(datas)
            root_url = url_host.format(next_links)
            i += 1
            if i > max_link:
                break
        self.output.output_end()
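
Example 2's pagination hinges on two XPath queries against the lxml tree: one reads the total page count from a data-total-page attribute, the other reads the href of the next paginator link. A self-contained sketch against made-up markup (the HTML below only imitates the structure those expressions expect; it is not Douban's real page):

from lxml import etree

html_text = '''
<div class="paginator">
  <span data-total-page="3"></span>
  <a href="?start=25">2</a>
  <a href="?start=50">3</a>
</div>
<div class="main-bd"><h2><a href="https://example.com/review/1">A review</a></h2></div>
'''

html = etree.HTML(html_text)
# Total number of pages, e.g. 3
max_link = int(''.join(html.xpath(".//span[@data-total-page]/@data-total-page")))
# href of the i-th <a> inside the paginator (i == 2 here)
next_links = ''.join(html.xpath(".//div[@class='paginator']/a[2]/@href"))
# Links to the review detail pages on the current listing page
details = html.xpath(".//div[@class='main-bd']/h2/a/@href")

print(max_link, next_links, details)  # 3 ?start=50 ['https://example.com/review/1']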
Example 3
class SpiderMan(object):
    """docstring for SpiderMan"""
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry-point URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract new URLs and the page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data with the data output module
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Have the data output module write the results in the target format
        self.output.output_html()
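
The UrlManager used by Example 3 is not shown. A minimal in-memory version that satisfies exactly the calls made in the loop (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size); this is a sketch of the common two-set design, not necessarily the original class:

class UrlManager:
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        # Ignore empty URLs and URLs we have already seen.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def old_url_size(self):
        return len(self.old_urls)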
Example 4
 def __init__(self):
     '''
     Initialize the component classes
     '''
     self.UM = UrlManager.UrlManager()
     self.HP = HtmlParser.HtmlParser()
     self.DA = DataArranger.DataArrange()
     self.driver = webdriver.Chrome()
Example 5
 def __init__(self):
     '''
     Initialize each module
     '''
     self.DA = DataArranger.DataArranger()
     self.HD = HtmlDownloader.HtmlDownloader()
     self.HP = HtmlParser.HtmlParser()
     self.UM = UrlManager.UrlManager()
	def getNumbers(self):
		self["tituloLabel"].text = _("Getting services numbers...")

		numeros=parser.getCanais(self.selecionado)
		if not numeros:
			self.session.open(MessageBox, text = _("There was a problem to access the %s!")%parser._urlPadrao+"/"+self.selecionado[1], type = MessageBox.TYPE_ERROR,close_on_any_key=True, timeout=5)
			self.cancel()
		else:
			self.session.openWithCallback(self.selecaoCallback, DuvidasCanaisScreen, numeros)
Example 7
def main():
    # Zhejiang University
    # school = "浙江大学"
    # url = 'https://baike.baidu.com/item/%E6%B5%99%E6%B1%9F%E5%A4%A7%E5%AD%A6'
    # Northeastern University
    # school = "东北大学"
    # url = 'https://baike.baidu.com/item/%E4%B8%9C%E5%8C%97%E5%A4%A7%E5%AD%A6/18014'

    # 高玉堂 (Gao Yutang, the scholar queried below)
    # name = "高玉堂"
    # url = 'http://xueshu.baidu.com/scholarID/CN-BT73WSNJ'

    name = "scholar"
    i = 0
    # url_list
    url_list = []
    url_list.append('http://xueshu.baidu.com/scholarID/CN-BT73WSNJ')
    url_list.append('http://xueshu.baidu.com/scholarID/CN-B3742FWJ')
    url_list.append('http://xueshu.baidu.com/scholarID/CN-B0746Q8J')
    url_list.append('http://xueshu.baidu.com/scholarID/CN-B97472MJ')
    url_list.append('http://xueshu.baidu.com/scholarID/CN-BN733MNJ')

    download = HtmlDownloader()
    parser = HtmlParser()

    for item in url_list:

        url = item
        # download = HtmlDownloader()
        # parser = HtmlParser()
        html_cont = download.download(url)
        # data, table = parser.parser(url, html_cont)
        data = parser.parser(url, html_cont, url_list)

        # output_dict(school + "out_dict.txt", data)
        # output_table(school + "out_table.txt", table)
        # return

        output_dict(name + str(i) + "out_dict.txt", data)
        # output_table(name + "out_table.txt", table)
        i += 1
        # time.sleep(1)

    return
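
output_dict is called in Example 7 but never defined there. A plausible helper, guessed from the file names used above (an assumption, not the project's actual function), would simply dump the parsed dictionary to a text file:

def output_dict(filename, data):
    # Write each key/value pair of the parsed result on its own line.
    with open(filename, 'w', encoding='utf-8') as f:
        for key, value in (data or {}).items():
            f.write('%s\t%s\n' % (key, value))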
Example 8
    def __init__(self, Response):
        '''Wrap the given Response in an appropriate HtmlParser.'''
        try:
            parser = HtmlParser.HtmlParser(Response)
        except Exception:
            msg = 'There is no parser for "%s".' % Response.get_url()
            raise Exception(msg)

        self._parser = parser
Example 9
 def __init__(self):
     # Set up this worker node's connection for the distributed crawl
     # Step 1: register with BaseManager the method names used to fetch the Queues
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Step 2: connect to the server
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     # Keep the port and authkey exactly the same as in the server process
     # (under Python 3 the authkey must be bytes, e.g. b'baike')
     self.m = BaseManager(address=(server_addr, 8001), authkey='baike')
     # Connect over the network
     self.m.connect()
     # Step 3: get the Queue objects
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     # Initialize the page downloader and parser
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish.')
Example 10
 def __init__(self):
     # Crawl depth (number of pages)
     self.maxPageDeep = 1
     # URL manager
     self.UrlsManager = UrlManager.UrlManager()
     # Downloader
     self.Downloader = HtmlDownloader.HtmlDownloader()
     # Parser
     self.Parser = HtmlParser.HtmlParser()
     # Output writer
     self.Outputer = HtmlOutputer.HtmlOutputer()
	def getOpcoes(self):
		self.menuList=parser.getOpcoes()
		if not self.menuList:
			self.session.open(MessageBox, text = _("There was a problem to access the %s!")%parser._urlPadrao, type = MessageBox.TYPE_ERROR,close_on_any_key=True, timeout=5)
			self.cancel()
		else:
			self.clearSelection()

			self.timerMsg=eTimer()
			self.timerMsg.callback.append(self.trocaMensagem)
			self.timerMsg.start(100,True)
Example 12
def crawl():
    try:
        global count, mutex
        if mutex.acquire():
            count += 1
            new_url = url.get_new_url()
            print('Crawling item ' + str(count) + ': ' + new_url)
            mutex.release()
            html = downloader.download(new_url)
            url_list = parser.parser(html)
            url.add_new_urls(url_list)
    except Exception as e:
        print('Unexpected exception:', e)
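
crawl() in Example 12 depends on module-level state (count, mutex, url, downloader, parser) that the snippet does not define, and it is clearly meant to be called from several threads at once. Below is a self-contained sketch of that pattern with stand-in objects (everything except the lock/counter idiom is invented for illustration); using `with mutex:` also guarantees the lock is released even if an exception is raised inside the critical section:

import threading

count = 0
mutex = threading.Lock()
pending = ['http://example.com/page/%d' % n for n in range(20)]  # plays the role of `url`

def crawl():
    global count
    with mutex:                        # released automatically, even on errors
        if not pending:
            return
        count += 1
        new_url = pending.pop()
        print('Crawling item %d: %s' % (count, new_url))
    # downloading and parsing would happen here, outside the lock

threads = [threading.Thread(target=lambda: [crawl() for _ in range(5)])
           for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()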
Example 13
    def __init__(self):
        #self.login = login()
        self.urls = UrlManager.url_manager()
        self.downloader = HtmlDownloader.htmldownloader()
        self.parser = HtmlParser.htmlparser()
        self.imgdownloader = ImgDownloader.imgdownloader()

        self.url_list = self.get_url_list()
        self.url_list_num = len(self.url_list)
        self.url_list_cnt = 0

        self.img_list = None
        self.img_list_num = 0
        self.img_list_cnt = 0
Example 14
 def __init__(self):
     # Set up this worker node's connection for the distributed crawl
     # Step 1: register with BaseManager the method names used to fetch the Queues
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Step 2: connect to the server:
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     # Keep the port and authkey exactly the same as in the server process:
     self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
     # Connect over the network:
     self.m.connect()
     # Step 3: get the Queue objects:
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     # Initialize the page downloader and parser
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
     print('init finish')
Example 15
class SpiderWork:
    def __init__(self):
        # Set up this worker node's connection for the distributed crawl
        # Step 1: register with BaseManager the method names used to fetch the Queues
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # Keep the port and authkey exactly the same as in the server process
        # (under Python 3 the authkey must be bytes, e.g. b'baike')
        self.m = BaseManager(address=(server_addr, 8001), authkey='baike')
        # Connect over the network
        self.m.connect()
        # Step 3: get the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the page downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish.')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told the crawler nodes to stop...')
                        # Notify the other nodes to stop as well
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Crawler node is parsing: %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Connection to the worker node failed')
                return
            except Exception as e:
                print(e)
                print('Crawl fail.')
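
Examples 9, 14 and 15 all implement the worker side of a multiprocessing.managers.BaseManager setup; the control node that actually owns the two queues is not shown anywhere in this collection. A minimal sketch of a serving side these workers could connect to (the seed URL and the 'end' shutdown protocol are assumptions inferred from the worker code):

from multiprocessing.managers import BaseManager
import queue

task_q = queue.Queue()
result_q = queue.Queue()

# Register callables that return the queues under the names the workers expect.
# (Lambdas work on Unix; on Windows they must be module-level functions.)
BaseManager.register('get_task_queue', callable=lambda: task_q)
BaseManager.register('get_result_queue', callable=lambda: result_q)

if __name__ == '__main__':
    # Address and authkey must match the workers; Python 3 wants a bytes authkey.
    manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
    manager.start()

    task = manager.get_task_queue()
    result = manager.get_result_queue()

    task.put('https://baike.baidu.com/item/Python')  # illustrative seed URL
    task.put('end')                                  # ask the workers to stop

    print(result.get())  # blocks until a worker reports {'new_urls': ..., 'data': ...}
    manager.shutdown()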
Example 16
 def __init__(self):
     self.urls = UrlManager.UrlManager()
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
     self.outputer = HtmlOutputer.HtmlOutputer()
class TestHtmlParser(unittest.TestCase):

    #Setup an instance of HtmlParser to test
    def setUp(self):
        html = raw_input("Html: ")
        self.htmlParser = HtmlParser(html)

    #Test the various parsing functions
        
    def test_idenParser(self):
        assert self.htmlParser.idenParser() == 89551
        
    def test_titleParser(self):
        assert self.htmlParser.titleParser() == 'GNOME clutter Unmasked Password Field Cleartext Credential Disclosure'

    def test_viewsParser(self):
        assert self.htmlParser.viewsParser() == 43
        
    def test_dDParser(self):
        assert self.htmlParser.dDParser() == '2012-10-06'

    def test_descriptionParser(self):
        assert self.htmlParser.descriptionParser() == "GNOME contains a flaw in clutter that may lead to unauthorized disclosure of potentially sensitive information. The issue is due to the program displaying the unmasked password field in cleartext. This may allow a physically proximate attacker to gain access to credential information when looking at a user's screen."

    def test_locationParser(self):
        assert self.htmlParser.locationParser() == 'Physical Access Required'

    def test_attackTypeParser(self):
        assert self.htmlParser.attackTypeParser() == 'Cryptographic,Information Disclosure'

    def test_impactParser(self):
        assert self.htmlParser.impactParser() == 'Loss of Confidentiality'

    def test_solutionParser(self):
        assert self.htmlParser.solutionParser() == 'Patch / RCS'

    def test_exploitParser(self):
        assert self.htmlParser.exploitParser() == 'Exploit Public'

    def test_disclosureParser(self):
        assert self.htmlParser.disclosureParser() == 'Vendor Verified'

    def test_specSolutionParser(self):
        assert self.htmlParser.specSolutionParser() == 'The vendor has released a patch to address this vulnerability. There are no known workarounds or upgrades to correct this issue. Check the vendor advisory, changelog, or solution in the references section for details.'

    def test_creditParser(self):
        assert self.htmlParser.creditParser() == 'Alejandro Piñeiro Iglesias'

    def test_cvssAccessVecParser(self):
        assert self.htmlParser.cvssAccessVecParser() == None

    def test_authenticationParser(self):
        assert self.htmlParser.authenticationParser() == None

    def test_confidentialityParser(self):
        assert self.htmlParser.confidentialityParser() == None

    def test_integrityParser(self):
        assert self.htmlParser.integrityParser() == None

    def test_availabilityParser(self):
        assert self.htmlParser.availabilityParser() == None

    def test_baseScoreParser(self):
        assert self.htmlParser.baseScoreParser() == None
    
    def tearDown(self):
        pass
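
The setUp above reads the page HTML from standard input (raw_input), which makes the suite interactive and Python 2 only. A common alternative, sketched here as an assumption (the fixture path and module import are made up), is to load a locally saved copy of the OSVDB entry so the tests run unattended:

import unittest
from HtmlParser import HtmlParser  # the class under test, assumed importable

class TestHtmlParserFromFixture(unittest.TestCase):
    def setUp(self):
        # Load a saved copy of the page instead of prompting on stdin.
        with open('fixtures/osvdb_89551.html') as f:
            self.htmlParser = HtmlParser(f.read())

    def test_idenParser(self):
        self.assertEqual(self.htmlParser.idenParser(), 89551)

if __name__ == '__main__':
    unittest.main()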
Example 18
 def __init__(self):
     self.downloader=downloader.Downloader()
     self.outputer=FileOutputer.FileOutputer()
     self.parser=HtmlParser.HtmlParser()
Example 19
 def __init__(self):
     self.manager = UrlManager.UrlManager()
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
     self.output = DataOutput.DataOutput()
Example 20
 def __init__(self):
     
     self.UM = UM.UrlManager()
     self.HD = HD.HtmlDownloader()
     self.HP = HP.HtmlParser()
     self.DA = DA.DataArrange()
Example 21
 def __init__(self):
     self.urlManager = UrlManager.UrlManager()
     self.htmlDownloader = HtmlDownloader.HtmlDownloader()
     self.htmlParser = HtmlParser.HtmlParser()
     self.fileOutputer = FileOutputer.FileOutputer()
 def setUp(self):
     html = raw_input("Html: ")
     self.htmlParser = HtmlParser(html)
Example 23
def fetchPage(fetchUrl):
    f = urllib.urlopen(fetchUrl)
    content = f.read()
    f.close()
    return content


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Please specify url and convert method"
        sys.exit(1)
    targetUrl = sys.argv[1]
    content = fetchPage(sys.argv[1])
    if not content:
        sys.exit(0)

    if (targetUrl.find("weixin") != -1):
        handler = HtmlParser.WeixinParser(content, sys.argv[2])
    elif (targetUrl.find("buzzfeed") != -1):
        handler = HtmlParser.BuzzFeedParser(content, sys.argv[2])
    else:
        print "No existed parser implementation"
        sys.exit(1)

    handler.process()

    print handler.getTitle()
    print handler.getDate()
    print handler.getContent() + suffixString
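
Example 23 is Python 2 code (urllib.urlopen and print statements). Under Python 3 the same fetch helper would go through urllib.request and decode the response bytes; a minimal sketch:

from urllib.request import urlopen

def fetch_page(fetch_url):
    # Python 3 counterpart of fetchPage: read the body and decode it to text.
    with urlopen(fetch_url) as f:
        return f.read().decode('utf-8', errors='replace')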
Example 24
 def __init__(self):
     super(SpiderMain, self).__init__()
     self.urls = UrlManager.UrlManager()
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
     self.outputer = HtmlOutputer.HtmlOutputer()
Example 25
 def __init__(self):
     self.manager = UrlManager.UrlManager()
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
Example 26
 def __init__(self):
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
# coding=utf-8
import IOUtils
import HtmlDownloader
import HtmlParser
import HtmlOutputer
import bd

L=[]
hd = HtmlDownloader.HtmlDownloader()
hp = HtmlParser.HtmlParser()
io =IOUtils.IOUtils()
ho = HtmlOutputer.HtmlOutputer()
datas = io.getListOrDictFromJsonFile("F:\\ajk\\info.json")
print len(datas)
L_res=[]
for data in datas:
    name = data[u'小区名称']   # residential community name
    print name
    # The next line UTF-8-encodes the name so it can be used in a URL
    name = name.encode('utf-8')
    url = data[u'网站']        # listing URL

    d = bd.getPos(name)
    if d is None:
        data[u'精度']=u"未找到"
        data[u'纬度']=u"未找到"
        data[u'附件500米幼儿园']=u"未找到"
        data[u'附近3000米幼儿园']=u"未找到"
        data[u'附近500米医院']=u"未找到"
        data[u'附近3000米医院']=u"未找到"
        data[u'附近500米商场']=u"未找到"