Beispiel #1
0
 def process_item(self, item, spider):
     """Persist a scraped proxy item to the database.

     Hashes "ip.port" with MD5 as a stable unique key, inserts a Proxy
     row, and drops the item if the commit fails.

     :param item: scraped item with 'ip' and 'port' string fields
     :param spider: spider that produced the item (unused)
     :return: the item, so later pipeline stages still receive it
     :raises DropItem: when the database commit fails
     """
     db = DBSession()
     md5 = hashlib.md5()
     # ip/port are ASCII, so encoding is lossless and keeps this working
     # on Python 3, where md5.update() requires bytes.
     md5.update((item['ip'] + "." + item['port']).encode("utf-8"))
     haship = md5.hexdigest()
     proxy = Proxy(haship=haship,
                   ip=item['ip'],
                   port=item['port'],
                   create_time=int(time.time()))
     db.add(proxy)
     try:
         db.commit()
     except exc.SQLAlchemyError as e:
         raise DropItem("SaveError: %s:%s %s" %
                        (item['ip'], item['port'], format(e)))
     finally:
         # Always release the session, even when the commit fails.
         db.close()
     return item
Beispiel #2
0
def delete_proxy(ip, port):
    """Delete a proxy record from the database.

    :param ip: proxy IP address
    :param port: proxy port
    :return: True when the delete committed, False otherwise
    """
    # Guard clause: refuse to run a delete with an open-ended filter.
    if ip == "" or port == "":
        return False
    db = DBSession()
    db.query(Proxy).filter(Proxy.ip == ip).filter(
        Proxy.port == port).delete()
    try:
        db.commit()
        return True
    except exc.SQLAlchemyError as e:
        # Original passed format(e) as a lazy arg with no %s placeholder,
        # which breaks log-record formatting; use a real placeholder.
        logging.info("Delete Proxy Error: %s", format(e))
        return False
    finally:
        # Release the session on both success and failure paths.
        db.close()
Beispiel #3
0
def get_proxy_list():
    """Return every Proxy row currently stored in the database."""
    session = DBSession()
    rows = session.query(Proxy).all()
    session.close()
    return rows
Beispiel #4
0
#  python
# -*- coding: utf-8 -*-
# from sqlalchemy import exc
from conf.config import DBSession
from util import WebProxy as webProxy
from model.Proxy import Proxy

db = DBSession()
ips = db.query(Proxy).all()
# Release the session as soon as the rows are materialized; the loop
# below presumably opens its own sessions inside the webProxy helpers
# (delete_proxy elsewhere in this project does) — the original leaked
# this session by never closing it.
db.close()

# Re-validate every stored proxy and remove the ones that fail the check.
for item in ips:
    ret = webProxy.check_proxy(item.ip, item.port)
    if not ret:
        webProxy.delete_proxy(item.ip, item.port)

Beispiel #5
0
    def process_item(self, item, spider):
        """Persist a scraped page item: DB row, raw HTML file, redis flag.

        Builds a Page row keyed by the MD5 of the page URL, writes the raw
        HTML body to disk under html_path, and records the URL in redis.

        :param item: scraped item; requires 'url', 'html_title',
            'html_body'; other fields are optional and default to ""
        :param spider: spider that produced the item; supplies rule_id
        :raises DropItem: when the database commit fails

        NOTE(review): when a row with the same urlmd5 already exists, this
        method builds a second Page WITHOUT the urlmd5 field and still
        adds it, inserting a new row rather than updating the existing
        one — confirm whether that duplicate insert is intentional.
        """
        db = DBSession()
        redis = confRedis

        rule_id = spider.rule_id
        url = item['url']
        # MD5 of the URL serves as the de-duplication key checked below.
        md5 = hashlib.md5()
        md5.update(url)
        urlmd5 = md5.hexdigest()
        site_name = utils.get_site(item['url'])
        # site_name = spider.rule['allow_domains']
        html_title = item['html_title']
        # html_body = item['html_body']
        save_path = utils.md5dir(item['url'])
        save_time = int(time.time())
        # Optional fields fall back to "" when the spider did not set them.
        title = item['title'] if 'title' in item else ""
        body = item['body'] if 'body' in item else ""
        thumb = item['thumb'] if 'thumb' in item else ""
        img_list = item['img_list'] if 'img_list' in item else ""

        # TODO: use an analysis step here to parse the scraped publish
        # time and convert it into a unix timestamp.
        publish_time = utils.smart2date(
            item['publish_time']) if 'publish_time' in item else ""
        source_site = item['source_site'] if 'source_site' in item else ""
        flag = default_page_flag

        page = Page(rule_id=rule_id,
                    url=item['url'],
                    urlmd5=urlmd5,
                    site_name=site_name,
                    html_title=html_title,
                    save_path=save_path,
                    save_time=save_time,
                    title=title,
                    thumb=thumb,
                    img_list=img_list,
                    body=body,
                    publish_time=publish_time,
                    source_site=source_site,
                    flag=flag)
        # A row with this urlmd5 already exists: rebuild the Page without
        # the urlmd5 field (see NOTE(review) in the docstring).
        has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
        if has:
            page = Page(rule_id=rule_id,
                        url=item['url'],
                        site_name=site_name,
                        html_title=html_title,
                        save_path=save_path,
                        save_time=save_time,
                        title=title,
                        thumb=thumb,
                        img_list=img_list,
                        body=body,
                        publish_time=publish_time,
                        source_site=source_site,
                        flag=flag)

        db.add(page)
        try:
            db.commit()
            # Only after a successful commit: write the raw HTML to disk
            # and record the URL in redis.
            utils.save_file('%s/%s' % (html_path, save_path),
                            item['html_body'])
            redis.set('url:%s' % url, 1)
        except exc.SQLAlchemyError, e:
            raise DropItem("SaveDbError: %s,%s" % (url, format(e)))