def process_item(self, item, spider):
    """Persist a scraped proxy (ip/port) into the proxy table.

    :param item: scraped item; must provide 'ip' and 'port'
    :param spider: the running spider (unused here)
    :return: the item, so later pipelines keep receiving it
    :raises DropItem: when the database commit fails
    """
    db = DBSession()
    # Rows are keyed by md5("ip.port") so the same proxy is stored once.
    # ip/port are ASCII, so encode() is safe on both Python 2 and 3.
    digest = hashlib.md5()
    digest.update((item['ip'] + "." + item['port']).encode('utf-8'))
    haship = digest.hexdigest()
    proxy = Proxy(haship=haship,
                  ip=item['ip'],
                  port=item['port'],
                  create_time=int(time.time()))
    db.add(proxy)
    try:
        db.commit()
    except exc.SQLAlchemyError as e:
        # Roll back so the session stays usable after the failed commit.
        db.rollback()
        raise DropItem("SaveError: %s:%s %s" % (item['ip'], item['port'], format(e)))
    # Scrapy pipelines must return the item or downstream pipelines drop it.
    return item
def delete_proxy(ip, port):
    """Delete a proxy record from the database.

    :param ip: proxy IP address
    :param port: proxy port
    :return: True if the delete committed, False otherwise
    """
    # Guard clause: empty identifiers previously fell through and
    # implicitly returned None; report an explicit failure instead.
    if ip == "" or port == "":
        return False
    db = DBSession()
    db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
    try:
        db.commit()
        return True
    except exc.SQLAlchemyError as e:
        # Keep the session usable after the failed commit.
        db.rollback()
        # logging.info(msg, *args) %-formats args into msg; the original
        # message had no placeholder, so the error text was silently lost.
        logging.info("Delete Proxy Error: %s", format(e))
        return False
def delete_proxy(ip, port):
    """Delete a proxy record from the database.

    :param ip: proxy IP address
    :param port: proxy port
    :return: True if the delete committed, False otherwise
    """
    # Guard clause: empty identifiers previously fell through and
    # implicitly returned None; report an explicit failure instead.
    if ip == "" or port == "":
        return False
    db = DBSession()
    db.query(Proxy).filter(Proxy.ip == ip).filter(Proxy.port == port).delete()
    try:
        db.commit()
        return True
    except exc.SQLAlchemyError as e:
        # Keep the session usable after the failed commit.
        db.rollback()
        # logging.info(msg, *args) %-formats args into msg; the original
        # message had no placeholder, so the error text was silently lost.
        logging.info("Delete Proxy Error: %s", format(e))
        return False
def process_item(self, item, spider):
    """Persist a crawled page: insert a Page row, save the raw HTML to
    disk under html_path, and mark the URL as seen in redis.

    :param item: scraped fields; 'url', 'html_title' and 'html_body' are
                 required, the rest default to ""
    :param spider: the running spider; supplies rule_id
    :return: the item, so later pipelines keep receiving it
    :raises DropItem: when the database commit fails
    """
    db = DBSession()
    redis = confRedis
    rule_id = spider.rule_id
    url = item['url']
    md5 = hashlib.md5()
    md5.update(url)
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    html_title = item['html_title']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    # Optional fields default to the empty string.
    title = item.get('title', "")
    body = item.get('body', "")
    thumb = item.get('thumb', "")
    img_list = item.get('img_list', "")
    # TODO: smart2date parses the scraped publish time into a timestamp.
    publish_time = utils.smart2date(item['publish_time']) if 'publish_time' in item else ""
    source_site = item.get('source_site', "")
    flag = default_page_flag
    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5,
                site_name=site_name, html_title=html_title,
                save_path=save_path, save_time=save_time, title=title,
                thumb=thumb, img_list=img_list, body=body,
                publish_time=publish_time, source_site=source_site,
                flag=flag)
    # NOTE(review): when a row with this urlmd5 already exists, the page is
    # rebuilt WITHOUT urlmd5 and still inserted. This looks like it was
    # meant to skip duplicates instead — confirm the intended dedup
    # behavior before changing it; the original logic is preserved here.
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name,
                    html_title=html_title, save_path=save_path,
                    save_time=save_time, title=title, thumb=thumb,
                    img_list=img_list, body=body,
                    publish_time=publish_time, source_site=source_site,
                    flag=flag)
    db.add(page)
    try:
        db.commit()
        # Only persist the HTML and mark the URL seen once the row commits.
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError as e:
        # Roll back so the session stays usable after the failed commit.
        db.rollback()
        raise DropItem("SaveDbError: %s,%s" % (url, format(e)))
    # Scrapy pipelines must return the item or downstream pipelines drop it.
    return item
def process_item(self, item, spider):
    """Persist a crawled page: insert a Page row, save the raw HTML to
    disk under html_path, and mark the URL as seen in redis.

    :param item: scraped fields; 'url', 'html_title' and 'html_body' are
                 required, the rest default to ""
    :param spider: the running spider; supplies rule_id
    :return: the item, so later pipelines keep receiving it
    :raises DropItem: when the database commit fails
    """
    db = DBSession()
    redis = confRedis
    rule_id = spider.rule_id
    url = item['url']
    md5 = hashlib.md5()
    md5.update(url)
    urlmd5 = md5.hexdigest()
    site_name = utils.get_site(item['url'])
    html_title = item['html_title']
    save_path = utils.md5dir(item['url'])
    save_time = int(time.time())
    # Optional fields default to the empty string.
    title = item.get('title', "")
    body = item.get('body', "")
    thumb = item.get('thumb', "")
    img_list = item.get('img_list', "")
    # TODO: smart2date parses the scraped publish time into a timestamp.
    publish_time = utils.smart2date(item['publish_time']) if 'publish_time' in item else ""
    source_site = item.get('source_site', "")
    flag = default_page_flag
    page = Page(rule_id=rule_id, url=item['url'], urlmd5=urlmd5,
                site_name=site_name, html_title=html_title,
                save_path=save_path, save_time=save_time, title=title,
                thumb=thumb, img_list=img_list, body=body,
                publish_time=publish_time, source_site=source_site,
                flag=flag)
    # NOTE(review): when a row with this urlmd5 already exists, the page is
    # rebuilt WITHOUT urlmd5 and still inserted. This looks like it was
    # meant to skip duplicates instead — confirm the intended dedup
    # behavior before changing it; the original logic is preserved here.
    has = db.query(Page).filter(Page.urlmd5 == urlmd5).first()
    if has:
        page = Page(rule_id=rule_id, url=item['url'], site_name=site_name,
                    html_title=html_title, save_path=save_path,
                    save_time=save_time, title=title, thumb=thumb,
                    img_list=img_list, body=body,
                    publish_time=publish_time, source_site=source_site,
                    flag=flag)
    db.add(page)
    try:
        db.commit()
        # Only persist the HTML and mark the URL seen once the row commits.
        utils.save_file('%s/%s' % (html_path, save_path), item['html_body'])
        redis.set('url:%s' % url, 1)
    except exc.SQLAlchemyError as e:
        # Roll back so the session stays usable after the failed commit.
        db.rollback()
        raise DropItem("SaveDbError: %s,%s" % (url, format(e)))
    # Scrapy pipelines must return the item or downstream pipelines drop it.
    return item