Example #1
import time


class SpiderManager(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Download the listing page and extract (movie_url, movie_id) pairs.
        content = self.downloader.download(root_url)
        # Optional debug dump of the raw page:
        # with open("content.html", 'wb') as f:
        #     f.write(content.encode('utf-8'))
        urls = self.parser.parser_url(root_url, content)
        print(urls)
        for url in urls:
            try:
                # The Mtime API expects a timestamp parameter; the trailing
                # "3282" is a literal suffix kept from the original request.
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed:', e)
        self.output.output_end()
        print('Crawl finished!')
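
Every example on this page depends on an HtmlDownloader class that is defined elsewhere in each project. A minimal sketch of what it might look like, assuming a requests-based implementation that returns the page text on success and None on failure (the proxies argument and the headers are assumptions inferred from how the class is called in Examples #1 and #2):

import requests


class HtmlDownloader(object):
    def __init__(self, proxies=None):
        # proxies is an optional requests-style mapping, e.g. {'http': '1.2.3.4:8080'}
        self.proxies = proxies

    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed default user agent
        try:
            resp = requests.get(url, headers=headers,
                                proxies=self.proxies, timeout=10)
            if resp.status_code == 200:
                resp.encoding = resp.apparent_encoding
                return resp.text
        except requests.RequestException:
            pass
        return None  # callers treat a falsy result as a failed download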
Example #2
def start(self):
    # Seed the URL manager and crawl until no unvisited URLs remain.
    self.url_manager.add_url(self.url)
    while self.url_manager.has_next():
        # A fresh downloader per request, each with a proxy from the pool.
        hd = HtmlDownloader(proxies=self.pool.get_proxy_ip())
        url = self.url_manager.get_url()
        data = hd.download(url)
        # Collect every <a href> on the page and queue the links.
        urls = self.parser.simple_tags(data, 'a', attributes=['href'])
        self.url_manager.add_urls([url_.get('href') for url_ in urls])
        title = self.parser.element(data, 'title')
        title = title.getText() if title else 'unknown'
        self.writer.load_data('[%s] %s' % (title, url))
    self.writer.writer()
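
The UrlManager in Example #2 is also defined elsewhere. A plausible minimal version, assuming the usual pair of seen/unseen sets (only the four method names are taken from the snippet above; the rest is an assumption):

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # discovered but not yet crawled
        self.old_urls = set()  # already crawled

    def add_url(self, url):
        # Ignore empty hrefs and URLs we have already seen or crawled.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_urls(self, urls):
        for url in urls:
            self.add_url(url)

    def has_next(self):
        return len(self.new_urls) > 0

    def get_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url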
Example #3
import traceback
from time import sleep


def spider():
    failed_list_urls = set()  # list pages that failed to download
    failed_info_urls = set()  # vulnerability detail pages that failed to download
    finished_info_urls = set()
    max_len = 100
    offset = 0
    downloader = HtmlDownloader()
    cnvd = CNVDOperator()
    finished_cnvd_ids = cnvd.get_all_cnvd_id()
    while True:
        # Retry a previously failed list page first; otherwise fetch the next page.
        if len(failed_list_urls) > 0:
            list_url = failed_list_urls.pop()
        else:
            list_url = url_generator(max_len, offset)
            offset += max_len

        html_content = downloader.download(list_url)
        if html_content:  # list page downloaded successfully
            info_urls = set(list_page_parser(html_content).get('info_urls'))
            info_urls.update(failed_info_urls)
            failed_info_urls.clear()

            if len(info_urls) == 0:  # last page reached, no more detail pages
                break

            for info_url in info_urls:
                cur_cnvd_id = get_cnvd_id_from_url(info_url)
                if cur_cnvd_id in finished_cnvd_ids:
                    print('Record already exists:', cur_cnvd_id)
                    continue
                if info_url in finished_info_urls:  # detail page already processed
                    continue
                sleep(REQUEST_DELAY)
                html_content = downloader.selenium_download(info_url)
                if html_content:  # detail page downloaded successfully
                    try:
                        gg_detail = info_page_parser(html_content)  # vulnerability details
                        cnvd.insert(meta_data=gg_detail)
                        finished_info_urls.add(info_url)
                        finished_cnvd_ids.add(cur_cnvd_id)
                    except Exception as e:
                        traceback.print_exc()
                        print('Insert failed for', cur_cnvd_id, ':', e)
                else:
                    failed_info_urls.add(info_url)
        else:
            failed_list_urls.add(list_url)
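
url_generator above pages through the CNVD list in steps of max_len. A hedged sketch of what it could look like (the endpoint and parameter names are guesses inferred from the call site, not taken from the project):

def url_generator(max_len, offset):
    # Hypothetical list-page URL; the real endpoint and parameter
    # names live elsewhere in the project.
    return ('https://www.cnvd.org.cn/flaw/list.htm'
            '?max=%d&offset=%d' % (max_len, offset))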
Example #4
def __update_proxy_pool(self):
    downloader = HtmlDownloader()
    proxy_pool = ProxiesPool()
    parser = HtmlParser()
    data = downloader.download(self.proxy_site)
    # The speed and latency bars alternate in the page, one pair per row.
    speed_times = parser.multilevel_tags(data, [{'tr': None}, {'div': {'class': 'bar'}}])
    ip_data = parser.elements(data, 'tr')[1:]  # skip the table header row
    speed = speed_times[::2]
    times = speed_times[1::2]
    for i, ip in enumerate(ip_data):
        # The first two non-empty cells of each row are the IP and port.
        d = {}
        for j, value in enumerate(filter(lambda x: x, ip.text.split('\n'))):
            if j == 0:
                d['ip'] = value
            elif j == 1:
                d['port'] = value
        if len(d.keys()) != 2:
            continue
        # Skip proxies whose speed or latency value (parsed from the
        # bar's title attribute) exceeds 1.
        if self.__re_number(speed[i].get('title')) > 1 \
                or self.__re_number(times[i].get('title')) > 1:
            continue

        proxy_pool.add({'http': '%s:%s' % (d.get('ip'), d.get('port'))})
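
Examples #2 and #4 share a ProxiesPool: Example #4 fills it from a freshly created instance, which suggests the pool keeps shared state. A minimal sketch under that assumption (the class-level store and random selection are guesses; only add() and get_proxy_ip() appear in the snippets):

import random


class ProxiesPool(object):
    # Class-level store so that every instance shares one pool; this is
    # an assumption, made because Example #4 adds to a fresh instance.
    _proxies = []

    def add(self, proxy):
        # proxy is a dict such as {'http': '1.2.3.4:8080'}
        if proxy not in self._proxies:
            self._proxies.append(proxy)

    def get_proxy_ip(self):
        # Hand out a random entry; None when the pool is empty.
        return random.choice(self._proxies) if self._proxies else None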
Example #5
class Crawler(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        # Map each keyword to its parsed search results.
        data = {}
        for word in keywords:
            data[word] = self.crawl(word)

        return data

    def crawl(self, word):
        # Build the search URL for the keyword, download it, and parse the page.
        url = self.manager.get_url(word)
        page = self.downloader.download(url)

        return self.parser.search(page)
Example #6
def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
Example #7
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
Example #8
def __init__(self):
    self.downloader = HtmlDownloader()