Example #1
import bee
import okaybuy  # assumed import; okaybuy provides per-store job rules

def crawl_okaybuy(logger, store_name):
    job_rules = okaybuy.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
Example #2
import bee
import taobao  # assumed import; taobao provides per-store job rules

def crawl_taobao(logger, store_name):
    job_rules = taobao.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
Example #3
import os

import bee

def test_simple_crawling_job(logger):

    seed_url = "file://%s/demosite/index.html" % (os.path.abspath('.'),)

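    # Job description: wires the link DB, task queue, output sink, fetcher, seeker, and miner together.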
    sample_job_rules = {
        "desc": "Example job, crawing the test site, extract prod desc", 
        "name": "sample",
        "num_workers": 1,
        "worker_params": {
            "max_crawler_failed_cnt": 3, 
            "max_crawler_timeout": 30, 
            "crawler_retry_interval": 10, 
            "pause_on_reinject": 0.1, 
            "pause_before_fetch": 0, 
            "pause_when_notask": 0.1,
        },
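        # Link database: persistent record of discovered links (SQLite file).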
        "linkdb": {
            "class_name": "bee.SqliteLinkDB",
            "params" : {
                "name": "sample_site.link.db",
            }
        },
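        # Task queue: pending fetch tasks held in memory.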
        "task_queue": {
            "class_name": "bee.MemTaskQueue",
            "params": {
            },
        },
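        # Output sink: extracted records go to a JSON file.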
        "output": {
            "class_name": "bee.JsonDumper",
            "params": {
                "filename": "sample_site.out.json"
            }
        },
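        # Fetchers download pages; simple_http_get is a plain HTTP GET with a 10 s timeout.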
        "fetcher_factory": {
            "rules": {
                "simple_http_get": {
                    "class_name": "bee.SimpleHTTPFetcher",
                    "params": {
                        "timeout": 10,
                        "user_agent": "Bee: picking good stuffs",
                        "proxy_host": "",
                        "proxy_port": 0,
                        "from_encoding": "utf-8"
                    }
                }
            }
        },
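        # Seekers pick which discovered URLs to enqueue and how each should be handled.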
        "seeker_factory": {
            "rules": {
                "simple_seek": {
                    "class_name": "bee.RuleBasedSeeker",
                    "params": {
                        "rules": [
                            [ ".*/cat\d+\.html", 3, 60, "simple_http_get", ["simple_seek"], [], False ],
                            [ ".*prod\d+\.html", 1, 60, "simple_http_get", [], ["simple_miner"], False ],
                        ],
                    }
                },
            }
        },
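        # Miners turn fetched pages into records; ProductMiner is defined outside this snippet.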
        "miner_factory": {
            "rules": {
                "simple_miner": {
                    "class_name": "ProductMiner",
                    "params": {
                    }
                },
            }
        },
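        # Seed task that bootstraps the crawl from the local demo site.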
        "seed_tasks": [
            {
                "url": seed_url, 
                "fetcher": "simple_http_get",
                "seekers": ["simple_seek"], 
                "miners": [],
                "hop": 0,
                "revisit_interval": 60,
            }
        ]
    }

    bee.run_job(sample_job_rules, max_idle_cnt=3, job_status_interval=1, logger=logger)
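
A minimal way to drive this test job, assuming bee accepts a standard-library logging.Logger for the logger argument (an assumption; the listing never shows how logger is built):

import logging

logging.basicConfig(level=logging.INFO)
test_simple_crawling_job(logging.getLogger("bee.sample"))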
Example #4
import bee
import paixie  # assumed import; paixie provides per-store job rules

def crawl_paixie(logger, store_name):
    job_rules = paixie.gen_job_rules(store_name)
    bee.run_job(job_rules, max_idle_cnt=3, job_status_interval=3, logger=logger)
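
Examples #1, #2, and #4 all follow the same pattern: a per-site module builds a rules dict shaped like the one in Example #3 and hands it to bee.run_job. A hypothetical gen_job_rules along those lines (the bee class names are copied from Example #3; the store URL, file names, rule patterns, and the StoreProductMiner class are invented for illustration, not taken from the real okaybuy/taobao/paixie modules):

def gen_job_rules(store_name):
    # Hypothetical sketch: reuse the Example #3 skeleton, deriving names,
    # files, and seeds from store_name.
    return {
        "desc": "Crawl products from %s" % store_name,
        "name": store_name,
        "num_workers": 1,
        "worker_params": {
            "max_crawler_failed_cnt": 3,
            "max_crawler_timeout": 30,
            "crawler_retry_interval": 10,
            "pause_on_reinject": 0.1,
            "pause_before_fetch": 0,
            "pause_when_notask": 0.1,
        },
        "linkdb": {
            "class_name": "bee.SqliteLinkDB",
            "params": {"name": "%s.link.db" % store_name},
        },
        "task_queue": {"class_name": "bee.MemTaskQueue", "params": {}},
        "output": {
            "class_name": "bee.JsonDumper",
            "params": {"filename": "%s.out.json" % store_name},
        },
        "fetcher_factory": {
            "rules": {
                "simple_http_get": {
                    "class_name": "bee.SimpleHTTPFetcher",
                    "params": {
                        "timeout": 10,
                        "user_agent": "Bee: picking good stuffs",
                        "proxy_host": "",
                        "proxy_port": 0,
                        "from_encoding": "utf-8",
                    },
                }
            }
        },
        "seeker_factory": {
            "rules": {
                "simple_seek": {
                    "class_name": "bee.RuleBasedSeeker",
                    "params": {
                        "rules": [
                            # Placeholder URL patterns; each real store has its own.
                            [ r".*/list\d+\.html", 3, 60, "simple_http_get", ["simple_seek"], [], False ],
                            [ r".*/item\d+\.html", 1, 60, "simple_http_get", [], ["store_miner"], False ],
                        ],
                    },
                }
            }
        },
        "miner_factory": {
            "rules": {
                "store_miner": {
                    "class_name": "StoreProductMiner",  # hypothetical miner class
                    "params": {},
                }
            }
        },
        "seed_tasks": [
            {
                "url": "http://%s.example.com/index.html" % store_name,  # placeholder URL
                "fetcher": "simple_http_get",
                "seekers": ["simple_seek"],
                "miners": [],
                "hop": 0,
                "revisit_interval": 60,
            }
        ],
    }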