Code example #1
    def test_crawl_with_csv(self):
        urlparse.urljoin = self.urljoin

        crawler = Crawler()
        epocaCosmeticos = EpocaCosmeticos()
        epocaCosmeticos.get_home_page = self.get_home_page
        epocaCosmeticos.get_product_pages = self.get_product_pages
        crawled_dict = crawler.crawl([epocaCosmeticos])

        base_path = os.path.abspath('.') + os.sep + 'tests'
        file_base_path = 'file:///' + base_path
        products = crawled_dict['EpocaCosmeticos']
        self.assertEquals(1, len(crawled_dict))
        self.assertEquals(2, len(products))
        self.assertEquals('Produto 1', products[0].name)
        self.assertEquals('Titulo do Produto 1', products[0].title)
        self.assertTrue(
            os.path.join(base_path, 'produto_1.html') in products[0].url)
        self.assertEquals('Produto 2', products[1].name)
        self.assertEquals('Titulo do Produto 2', products[1].title)
        self.assertTrue(
            os.path.join(base_path, 'produto_2.html') in products[1].url)

        exporter = CSVExporter()
        filename = ''
        for crawlable_name, exportable_list in crawled_dict.iteritems():
            exporter.export(crawlable_name, exportable_list)
            filename = exporter.__get_filename__(crawlable_name)

        self.assertTrue(os.path.getsize(filename) > 0)
        os.remove(filename)
Code example #2
File: main.py  Project: LeoOrange/ZhihuCrawler
def get_all_topics():
    crawler = Crawler()
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')
    all_topics = []
    for ht in home_topics:
        topics = crawler.get_topics_by_home_topic(ht) or []
        logger.info(f'{len(topics)} topics were found under home topic {ht}.')
        all_topics.extend(topics)
        break  # for debugging: stop after the first home topic
    return all_topics
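
Code examples #4 and #9 below pass a Crawler instance and a thread_num argument to get_all_topics, which points at a multi-threaded variant of the helper above that is not shown in this listing. A minimal sketch of what such a variant might look like, assuming crawler.get_home_topics() and crawler.get_topics_by_home_topic(ht) behave exactly as in the excerpt above and that logger is the module-level logger from example #5:

from concurrent.futures import ThreadPoolExecutor

def get_all_topics(crawler, thread_num=4):
    # Fetch the home topics once, then resolve each home topic's
    # sub-topics in parallel worker threads.
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')
    all_topics = []
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        results = pool.map(crawler.get_topics_by_home_topic, home_topics)
        for ht, topics in zip(home_topics, results):
            topics = topics or []
            logger.info(f'{len(topics)} topics were found under home topic {ht}.')
            all_topics.extend(topics)
    return all_topics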
Code example #3
 def test_get_product(self):
     crawler = Crawler()
     base_path = os.path.abspath('.') + os.sep + 'tests'
     file_base_path = 'file:///' + base_path
     link = os.path.join(file_base_path, 'produto_1.html')
     epoca = EpocaCosmeticos()
     print epoca.get_product_pages()  # debug output: the crawlable's configured product pages
     product = Page(EpocaCosmeticos(), link).get_product()
     self.assertEquals('Produto 1', product.name)
     self.assertEquals('Titulo do Produto 1', product.title)
     self.assertEquals(link, product.url)
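
Both this test and example #1 build file:// URLs for the local HTML fixtures by string concatenation ('file:///' + base_path). These tests are Python 2 code (note the print statement); on Python 3, pathlib can build a well-formed file URI directly. A small sketch, assuming the fixture still lives under tests/:

from pathlib import Path

# Resolve the fixture to an absolute path and convert it to a file:// URI,
# handling platform-specific separators and leading slashes.
link = Path('tests', 'produto_1.html').resolve().as_uri()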
Code example #4
def main():
    redis = redis_init()
    mongo = Mongo()
    sync_redis_with_mongo(redis, mongo)
    crawler = Crawler()
    topics = get_all_topics(crawler, thread_num=4)
    batch_process_topics_data(redis,
                              mongo,
                              crawler,
                              topics,
                              batch=50,
                              thread_num=8)
Code example #5
File: main.py  Project: LeoOrange/ZhihuCrawler
from __future__ import absolute_import
import logging

from utils.mongo import Mongo
from utils.toolkit import logging_init, redis_init
from crawling.crawler import Crawler

logging_init()
logger = logging.getLogger(__name__)


def get_all_topics():
    crawler = Crawler()
    home_topics = crawler.get_home_topics() or []
    logger.info(f'{len(home_topics)} home topics were found.')
    all_topics = []
    for ht in home_topics:
        topics = crawler.get_topics_by_home_topic(ht) or []
        logger.info(f'{len(topics)} topics were found under home topic {ht}.')
        all_topics.extend(topics)
        break  # for debugging: stop after the first home topic
    return all_topics


if __name__ == '__main__':
    redis = redis_init()
    topics = get_all_topics()
    crawler = Crawler()
    topic_iter = map(crawler.get_topic_data, topics)
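
The excerpt stops at topic_iter = map(crawler.get_topic_data, topics). The f-strings make this Python 3 code, where map() is lazy, so no topic data is actually fetched until the iterator is consumed; the original main.py presumably continues past this point. A minimal way to drive the iterator (indented to sit inside the if __name__ == '__main__': block), with the per-topic handling left as a placeholder:

    for topic_data in topic_iter:
        # get_topic_data is assumed to return one topic's crawled data;
        # replace the print with whatever persistence step the project uses.
        print(topic_data)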

Code example #6
File: crawl.py  Project: dimo414/pycrawl
    
    result_handler = logging.FileHandler(os.path.join(opts.log_dir, 'pycrawl.results.log'), 'w')
    result_handler.setFormatter(just_message_fmt)
    result_handler.setLevel(logging.INFO)
    logging.getLogger('result').addHandler(result_handler)
    
    err_handler = logging.StreamHandler(sys.stderr)
    err_handler.setLevel(logging.WARN)
    err_handler.setFormatter(logging.Formatter('%(name)-10s %(levelname)-8s  %(message)s'))
    logging.getLogger('').addHandler(err_handler)
    
    if opts.print_pages or opts.print_results:
      out_handler = logging.StreamHandler(sys.stdout)
      out_handler.setLevel(logging.INFO)
      out_handler.setFormatter(just_message_fmt)
      if opts.print_pages:
        logging.getLogger('page').addHandler(out_handler)
      if opts.print_results:
        logging.getLogger('result').addHandler(out_handler)

if __name__ == '__main__':
    """Sample crawl application.  Creates a crawler which will crawl local URLs and print URLs with links containing 'California'.
    Specify a custom user agent (Chrome 24 on Win7) and throttle requests to 10 per minute.  Then crawl the English Wikipedia, hitting
    the homepage and all pages linked to from that page, but no deeper."""
    configure(sys.argv)
    
    crawl = Crawler(test.basic.isLocal, action.basic.tagContains('a', 'California'))
    crawl.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17")
    crawl.setMaxHitsPerMin(10)
    crawl.crawl('http://en.wikipedia.org/wiki/Main_Page',1)
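
The same knobs generalize to other crawls. Below is a hedged variation that reuses only the calls demonstrated above (Crawler with a test predicate and an action, setMaxHitsPerMin, and the depth argument to crawl); the 'Python' search string and the higher rate limit are arbitrary choices for illustration:

# Report links containing 'Python' instead of 'California',
# allow up to 30 requests per minute, and follow links two levels deep.
crawl = Crawler(test.basic.isLocal, action.basic.tagContains('a', 'Python'))
crawl.setMaxHitsPerMin(30)
crawl.crawl('http://en.wikipedia.org/wiki/Main_Page', 2)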
Code example #7
 def __init__(self, queue_name='crawling'):
     self.amqp = Amqp(queue_name, ['database'])
     self.crawler = Crawler(self.amqp)
     self.amqp.receive(self)
Code example #8
 def __init__(self):
     Crawler.__init__(self)
     self.db = DatabaseConnector()
Code example #9
def crawl():
    sync_redis_with_mongo()
    crawler = Crawler()
    all_topic_dicts = get_all_topics(crawler, thread_num=4)
    batch_process_topics_data(crawler, all_topic_dicts, batch=50, thread_num=8)
Code example #10
# coding: utf-8

from crawling.crawler import Crawler
from service.impl.epoca_cosmeticos import EpocaCosmeticos
from service.impl.csv_exporter import CSVExporter

crawler = Crawler()
crawled_dict = crawler.crawl([EpocaCosmeticos()], max_delay=5)

exporter = CSVExporter()

for crawlable_name, exportable_list in crawled_dict.iteritems():
    exporter.export(crawlable_name, exportable_list)
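
This last snippet is Python 2 (dict.iteritems()). On Python 3 the same export loop, assuming the Crawler and CSVExporter interfaces are otherwise unchanged, only needs dict.items():

# Python 3: iterate the crawl results with items() instead of iteritems().
for crawlable_name, exportable_list in crawled_dict.items():
    exporter.export(crawlable_name, exportable_list)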