def main():
    global SHARE_Q
    global ACTIVE_Q
    threads = []
    session = loadSession()
    proxies = session.query(Proxy).filter(Proxy.type == "HTTP").order_by(
        Proxy.indate.desc()).limit(20000)
    # put the proxies to be checked into the task queue
    for proxy in proxies:
        SHARE_Q.put(proxy)
    # cap the number of worker threads
    for i in xrange(_WORKER_THREAD_NUM):
        thread = MyThread(worker)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    # while ACTIVE_Q still has items, keep running checkValid() on them
    while not ACTIVE_Q.empty():
        item = ACTIVE_Q.get()
        checkValid(item)
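main() relies on the SHARE_Q / ACTIVE_Q queues and a MyThread / worker pair that are defined elsewhere in the project and not shown here. A minimal sketch of what they might look like, assuming the workers simply drain SHARE_Q and hand the proxies on to ACTIVE_Q for checkValid():

# Sketch (assumed, not from the original source): the real worker may do a
# first-pass check before queueing; this version only moves items across.
import threading
import Queue  # "queue" in Python 3

SHARE_Q = Queue.Queue()    # proxies waiting to be processed
ACTIVE_Q = Queue.Queue()   # proxies handed on for checkValid()
_WORKER_THREAD_NUM = 10

class MyThread(threading.Thread):
    def __init__(self, func):
        super(MyThread, self).__init__()
        self.func = func

    def run(self):
        self.func()

def worker():
    while True:
        try:
            proxy = SHARE_Q.get(block=False)
        except Queue.Empty:
            break            # queue drained, let the thread exit
        ACTIVE_Q.put(proxy)  # pass the proxy on to the validation loop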
def checkValid(item):
    starttime = datetime.datetime.now()
    rst = checkProxy(proxyIP=item.ip_port, protocol="http", timeout=5)
    costtime = (datetime.datetime.now() - starttime).seconds
    if rst is not None and rst["status"] == "ok":
        proxy = freshProxy(ip_port=item.ip_port,
                           type=item.type,
                           location=rst["rstLocation"].encode("utf-8"),
                           speed=costtime,
                           source=item.source,
                           rule_id=item.rule_id,
                           lastcheck=datetime.datetime.now())
        print rst["rstIP"]
        print rst["rstLocation"].encode("utf-8")
        session = loadSession()
        try:
            session.merge(proxy)
            session.commit()
        except MySQLdb.IntegrityError as e:
            print e.message
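checkValid() expects checkProxy() to return None on failure, or a dict carrying "status", "rstIP" and "rstLocation". The real implementation is not shown here; a rough sketch of that contract, where TEST_URL and the response parsing are purely illustrative placeholders:

# Sketch (assumed): probe the proxy against an echo-IP endpoint and map the
# response into the dict shape that checkValid() reads.
import requests

TEST_URL = "http://ip.example.com/json"  # hypothetical echo-IP service

def checkProxy(proxyIP, protocol="http", timeout=5):
    proxies = {protocol: "%s://%s" % (protocol, proxyIP)}
    try:
        resp = requests.get(TEST_URL, proxies=proxies, timeout=timeout)
        data = resp.json()
        return {"status": "ok",
                "rstIP": data.get("ip"),
                "rstLocation": data.get("location", u"")}
    except Exception:
        return None  # treated as a failed check by checkValid()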
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from main.spiders import config
from main.spiders.model.rules import Rule
from main.spiders.model import loadSession
from main.spiders.proxy_spider import ProxySpiderSpider

settings = Settings()
settings.set("ITEM_PIPELINES", config.ITEM_PIPELINES)
settings.set("DEFAULT_REQUEST_HEADERS", config.DEFAULT_REQUEST_HEADERS)
settings.set("DOWNLOADER_MIDDLEWARES", config.DOWNLOADER_MIDDLEWARES)
settings.set("DOWNLOAD_DELAY", config.DOWNLOAD_DELAY)
settings.set("COOKIES_ENABLED", config.COOKIES_ENABLED)
settings.set("ROBOTSTXT_OBEY", config.ROBOTSTXT_OBEY)

process = CrawlerProcess(settings)

session = loadSession()
rules = session.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    print rule.id
    process.crawl(ProxySpiderSpider, rule)
process.start()
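The launcher above only needs the named settings to exist in main/spiders/config.py. A sketch of that module's shape (the pipeline and middleware paths below are hypothetical; the real values belong to the project):

# Sketch of main/spiders/config.py (assumed layout, illustrative values)
ITEM_PIPELINES = {"main.spiders.pipelines.ProxyPipeline": 300}
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}
DOWNLOADER_MIDDLEWARES = {
    "main.spiders.middlewares.RandomUserAgentMiddleware": 400,
}
DOWNLOAD_DELAY = 2
COOKIES_ENABLED = False
ROBOTSTXT_OBEY = False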
def deleteProxy(item):
    session = loadSession()
    session.query(Proxy).filter(Proxy.ip_port == item.ip_port).delete()
    session.commit()
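A typical caller would use deleteProxy() to drop proxies that no longer respond. An illustrative example (pruneDeadProxies is a hypothetical helper, not part of the original code), reusing checkProxy() from above:

# Example use (illustrative): remove HTTP proxies that fail a live check.
def pruneDeadProxies():
    session = loadSession()
    for item in session.query(Proxy).filter(Proxy.type == "HTTP"):
        if checkProxy(proxyIP=item.ip_port, protocol="http", timeout=5) is None:
            deleteProxy(item)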