Example #1
        resp = requests.get(url)
        if resp.status_code == 200:
            logging.info('Start Crawl kuai: %s', url)
            bs = BeautifulSoup(resp.text, 'html.parser')
            for t_body in bs.find_all('tbody'):
                for tr in t_body.find_all('tr'):
                    td_list = tr.find_all('td')
                    ip = td_list[0].text
                    port = td_list[1].text
                    proxy_type = td_list[3].text
                    judgeProxy(ip, port, proxy_type.lower())


def cron_crawl_proxy():
    """定时爬取快代理数据"""

    logging.info("Cron crawl kuai proxy")
    # Corresponds to high-anonymity, regular, https and http proxies
    crawl_list = ['inha', 'intr']
    multi_pool = multiprocessing.Pool(len(crawl_list))
    multi_pool.map(crawlProxy, crawl_list)
    multi_pool.close()


if __name__ == '__main__':
    # Crawl on a schedule
    days, hours = '0-6', '0-23'
    cron.cron_blocking(job=cron_crawl_proxy,
                       day_of_week=days,
                       hour=hours,
                       minute='45')
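
Example #1 is a fragment: the head of the crawl function (which builds url) is cut off, and the judgeProxy helper it calls is not shown. Below is a minimal sketch of what judgeProxy could look like, assuming working proxies are stored in Redis sets keyed by scheme (which would match Example #2 reading keys that match 'http*'); the Redis connection settings and the httpbin.org test URL are illustrative assumptions, not taken from the project.

import logging

import redis as redis_lib
import requests

# Assumed client; the real project presumably configures Redis elsewhere.
redis = redis_lib.StrictRedis(host='localhost', port=6379, decode_responses=True)


def judgeProxy(ip, port, proxy_type):
    """Hypothetical sketch: test a crawled proxy and keep the working ones in Redis."""
    address = '%s:%s' % (ip, port)
    proxies = {proxy_type: '%s://%s' % (proxy_type, address)}
    try:
        # Any successful response through the proxy counts as "alive".
        resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
        if resp.status_code == 200:
            redis.sadd(proxy_type, address)  # e.g. key 'http' or 'https'
            logging.info('Proxy %s looks usable', address)
    except requests.RequestException:
        logging.warning('Proxy %s failed the check', address)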
Example #2

def handleProxy(key):
    """Check whether the proxies stored under a Redis key are still valid."""

    # Pool.map in cron_handle_proxy passes one Redis key per call,
    # so this function works on a single key rather than a list of keys.
    for member in redis.smembers(key):
        proxy = {key: member}
        if member != checkProxy(proxy, timeout=5):
            logging.warning("Handle Proxy %s is invalid", member)
            redis.srem(key, member)


def cron_handle_proxy():
    """定时检测Redis代理数据"""

    logging.info("Cron handle proxy")
    keys = redis.keys('http*')
    multi_pool = multiprocessing.Pool(len(keys))
    multi_pool.map(handleProxy, keys)
    multi_pool.close()


if __name__ == '__main__':
    # Validate on a schedule
    days, hours = '0-6', '0-23'
    cron.cron_blocking(job=cron_handle_proxy,
                       day_of_week=days,
                       hour=hours,
                       minute='30')
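
Example #2 compares each stored member against the return value of checkProxy, which implies that helper returns the proxy address when it still works and something falsy otherwise. A minimal sketch under that assumption (the httpbin.org test URL is illustrative, not taken from the project):

import requests


def checkProxy(proxy, timeout=5):
    """Hypothetical sketch: return the proxy address if it still responds, else None."""
    scheme, member = next(iter(proxy.items()))  # proxy is {redis_key: 'ip:port'}
    proxies = {scheme: '%s://%s' % (scheme, member)}
    try:
        resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=timeout)
        if resp.status_code == 200:
            return member
    except requests.RequestException:
        pass
    return None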
Example #3
import sys
from utils import cron
from scrapy.cmdline import execute

# Crawl data
# execute(['scrapy', 'crawl', 'job', '-s', 'CLOSESPIDER_PAGECOUNT=10'])
# execute(['scrapy', 'crawl', 'job', '-s', 'CLOSESPIDER_ITEMCOUNT=5'])

# Check for expired items
# execute(['scrapy', 'crawl', 'expire'])


def run():
    execute(['scrapy', 'crawl', 'job'])


if __name__ == '__main__':
    if len(sys.argv) == 1:
        cron.cron_blocking(job=run, day_of_week='4', hour='21')
    else:
        run()
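
All four examples schedule their job through utils.cron.cron_blocking, whose implementation is not included here. A minimal sketch of such a helper, assuming it wraps APScheduler's BlockingScheduler and forwards the cron trigger fields (the real utils.cron module may be implemented differently):

from apscheduler.schedulers.blocking import BlockingScheduler


def cron_blocking(job, **trigger_fields):
    """Hypothetical sketch: block the process and run `job` on a cron schedule."""
    scheduler = BlockingScheduler()
    # trigger_fields are standard cron fields such as day_of_week, hour, minute.
    scheduler.add_job(job, 'cron', **trigger_fields)
    scheduler.start()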
Example #4
import sys
from utils import cron
from scrapy.cmdline import execute


def run():
    execute('scrapy crawl job'.split(' '))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        cron.cron_blocking(job=run, day_of_week='5')
    else:
        run()
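
A caveat for Examples #3 and #4: scrapy.cmdline.execute normally exits the interpreter once the crawl finishes, so a job scheduled with cron_blocking would only ever run once. If the crawl needs to repeat on every trigger, launching it in a child process is a common workaround; a minimal sketch, assuming the Scrapy project is the current working directory:

import subprocess


def run():
    # Launch the spider in a child process so the scheduler process survives.
    subprocess.run(['scrapy', 'crawl', 'job'], check=False)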