def delete_proxy(ip, port):
    """Delete a proxy record from the database.

    :param ip: proxy IP address
    :param port: proxy port
    :return: True when the delete was committed, False otherwise
    """
    if ip == "" or port == "":
        # Refuse to run an unfiltered DELETE when either key is missing
        # (the original fell through and implicitly returned None here).
        return False
    db = DBSession()
    db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
    try:
        db.commit()
        return True
    except exc.SQLAlchemyError as e:
        # Undo the pending delete so the session stays usable.
        db.rollback()
        # Lazy %-formatting: the original passed the exception as a bare
        # extra argument with no placeholder, so the error text was dropped.
        logging.info("Delete Proxy Error: %s", e)
        return False
    finally:
        # Always release the session, matching get_proxy_list().
        db.close()
def delete_proxy(ip, port):
    """Delete a proxy record from the database.

    :param ip: proxy IP address
    :param port: proxy port
    :return: True when the delete was committed, False otherwise
    """
    if ip == "" or port == "":
        # Refuse to run an unfiltered DELETE when either key is missing
        # (the original fell through and implicitly returned None here).
        return False
    db = DBSession()
    db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
    try:
        db.commit()
        return True
    except exc.SQLAlchemyError as e:
        # Undo the pending delete so the session stays usable.
        db.rollback()
        # Lazy %-formatting: the original passed the exception as a bare
        # extra argument with no placeholder, so the error text was dropped.
        logging.info("Delete Proxy Error: %s", e)
        return False
    finally:
        # Always release the session, matching get_proxy_list().
        db.close()
def process_item(self, item, spider):
    """Persist a crawled page to the database and mirror its HTML to disk.

    :param item: scraped fields; ``url``, ``html_title`` and ``html_body``
        are read unconditionally, the rest fall back to "" when absent
    :param spider: the running spider; its ``rule_id`` is stored on the row
    :raises DropItem: when the database commit fails
    """
    db = DBSession()
    redis = confRedis
    rule_id = spider.rule_id
    url = item['url']
    # Dedup key: md5 of the URL (Python 2: update() takes the raw byte str).
    md5 = hashlib.md5()
    md5.update(url)
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    # site_name = spider.rule['allow_domains']
    html_title = item['html_title']
    # html_body = item['html_body']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    # Optional fields default to "" when the spider did not extract them.
    title = item['title'] if 'title' in item else ""
    body = item['body'] if 'body' in item else ""
    thumb = item['thumb'] if 'thumb' in item else ""
    img_list = item['img_list'] if 'img_list' in item else ""
    # TODO: use an analysis step here to parse the crawled publish time and
    # convert it into a timestamp.
    publish_time = utils.smart2date(item['publish_time']) if 'publish_time' in item else ""
    source_site = item['source_site'] if 'source_site' in item else ""
    flag = default_page_flag
    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5,
                site_name=site_name, html_title=html_title,
                save_path=save_path, save_time=save_time, title=title,
                thumb=thumb, img_list=img_list, body=body,
                publish_time=publish_time, source_site=source_site, flag=flag)
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        # NOTE(review): when the URL already exists the row is rebuilt
        # WITHOUT urlmd5 and still added below, inserting a near-duplicate
        # page with an empty urlmd5 — this branch looks like it was meant to
        # skip or update instead; confirm intent before changing.
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name,
                    html_title=html_title, save_path=save_path,
                    save_time=save_time, title=title, thumb=thumb,
                    img_list=img_list, body=body, publish_time=publish_time,
                    source_site=source_site, flag=flag)
    db.add(page)
    try:
        db.commit()
        # Mirror the raw HTML to disk and mark the URL as seen in redis.
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError, e:
        raise DropItem("SaveDbError: %s,%s" % (url, format(e)))
def get_proxy_list():
    """Return every Proxy row stored in the database.

    :return: list of Proxy model instances
    """
    db = DBSession()
    try:
        return db.query(Proxy).all()
    finally:
        # Close the session even when the query raises, so a failed
        # query cannot leak the connection (the original only closed
        # on the success path).
        db.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Validate every stored proxy and purge the ones that no longer respond."""
from sqlalchemy import exc

from conf.config import DBSession
from model.Proxy import Proxy
from util import WebProxy as webProxy

db = DBSession()
try:
    for proxy in db.query(Proxy).all():
        # Remove the proxy from the DB when the liveness check fails.
        if not webProxy.check_proxy(proxy.ip, proxy.port):
            webProxy.delete_proxy(proxy.ip, proxy.port)
finally:
    # Release the session even if a check blows up mid-loop
    # (the original never closed it).
    db.close()
def process_item(self, item, spider):
    """Persist a crawled page to the database and mirror its HTML to disk.

    :param item: scraped fields; ``url``, ``html_title`` and ``html_body``
        are read unconditionally, the rest fall back to "" when absent
    :param spider: the running spider; its ``rule_id`` is stored on the row
    :raises DropItem: when the database commit fails
    """
    db = DBSession()
    redis = confRedis
    rule_id = spider.rule_id
    url = item['url']
    # Dedup key: md5 of the URL (Python 2: update() takes the raw byte str).
    md5 = hashlib.md5()
    md5.update(url)
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    # site_name = spider.rule['allow_domains']
    html_title = item['html_title']
    # html_body = item['html_body']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    # Optional fields default to "" when the spider did not extract them.
    title = item['title'] if 'title' in item else ""
    body = item['body'] if 'body' in item else ""
    thumb = item['thumb'] if 'thumb' in item else ""
    img_list = item['img_list'] if 'img_list' in item else ""
    # TODO: use an analysis step here to parse the crawled publish time and
    # convert it into a timestamp.
    publish_time = utils.smart2date(
        item['publish_time']) if 'publish_time' in item else ""
    source_site = item['source_site'] if 'source_site' in item else ""
    flag = default_page_flag
    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5,
                site_name=site_name, html_title=html_title,
                save_path=save_path, save_time=save_time, title=title,
                thumb=thumb, img_list=img_list, body=body,
                publish_time=publish_time, source_site=source_site, flag=flag)
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        # NOTE(review): when the URL already exists the row is rebuilt
        # WITHOUT urlmd5 and still added below, inserting a near-duplicate
        # page with an empty urlmd5 — this branch looks like it was meant to
        # skip or update instead; confirm intent before changing.
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name,
                    html_title=html_title, save_path=save_path,
                    save_time=save_time, title=title, thumb=thumb,
                    img_list=img_list, body=body, publish_time=publish_time,
                    source_site=source_site, flag=flag)
    db.add(page)
    try:
        db.commit()
        # Mirror the raw HTML to disk and mark the URL as seen in redis.
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError, e:
        raise DropItem("SaveDbError: %s,%s" % (url, format(e)))