#!/usr/bin/python
# -*- coding: UTF-8 -*-
import time

import requests

from dbHelper import getDushuConn
from framework.htmlParser import getSoupByStr
from networkHelper import getContentAndRedictedUrl

csor, conn = getDushuConn()

id = 825650  # highest article id; recrawl() scans downward from here
lastTime = 0

sites = open(u'3dWebsite.txt', 'w')
urlContents = {}
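# The superseded extraction pipeline (summarized inside recrawl below) relied
# on two config structures defined elsewhere in the project: ignores, with a
# 'hosts' list of host substrings to skip, and rules, a per-host map of a
# 'content' CSS selector plus 'rm' selectors to strip. The shapes below are
# illustrative assumptions reconstructed from how the old code indexed them,
# not the project's actual data:
#
#   ignores = {'hosts': ['some-blocked-host.com']}
#   rules = {
#       'common': {'rm': ['script', 'style']},
#       'b.easou.com': {'content': '#content', 'rm': ['.ad']},
#   }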
def recrawl():
    csor, conn = getDushuConn()
    batchId = id  # start from the module-level high-water mark and scan down
    while batchId > 0:
        # Pull one descending batch of 1000 ids, skipping book 49316.
        sql = "select id,rawUrl,content from cn_dushu_acticle " \
              "where id > %d and id < %d and bookId != 49316 " \
              "order by id desc" % (batchId - 1000, batchId)
        print sql
        csor.execute(sql)
        time.sleep(2)  # throttle between batches
        batchId -= 1000
        results = csor.fetchall()
        for row in results:
            rowId = row[0]  # keep the row id separate from the batch cursor
            url = row[1]
            # The inline pipeline that used to live here (skip hosts listed in
            # ignores['hosts'], follow redirects via getContentAndRedictedUrl,
            # re-check the redirected URL against the ignore list, apply the
            # per-host rules[host]['content'] selector and rules[host]['rm']
            # removals, unwrap a/b/font/span tags, fall back to readability
            # for unconfigured hosts, then trim boilerplate heads/tails and
            # stray characters) is now delegated to getAndParse.
            content, urlHost = getAndParse(url)
            if content and urlHost:
                update(csor, conn, rowId, urlHost, unicode(content))
    csor.close()
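# recrawl() calls getAndParse, which is not defined in this section. Below is
# a minimal sketch of what it plausibly does, reconstructed from the
# superseded inline pipeline: fetch the page (following redirects) with
# getContentAndRedictedUrl, take the redirected URL's hostname, and
# auto-extract the readable body. The readability import and the whole
# function body are assumptions, not the project's actual implementation.
from urlparse import urlparse

from readability import Document


def getAndParse(url):
    try:
        newContent, redUrl = getContentAndRedictedUrl(url)
    except Exception as e:
        print 'getAndParse failed for', url, e
        return None, None
    if not redUrl:
        return None, None
    urlHost = urlparse(redUrl).hostname
    soup = getSoupByStr(newContent.encode('utf-8'), "utf-8")
    # Auto-extract the main body, mirroring the old fallback of
    # Document(unicode(soup)).summary(html_partial=True) for hosts
    # without a configured content rule.
    content = Document(unicode(soup)).summary(html_partial=True)
    return content, urlHost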
def update(csor, conn, id, urlHost, content):
    # urlHost is accepted to match the call site in recrawl(); only the
    # content column is written back here.
    # Parameterized query, so quotes inside content cannot break the SQL.
    sql = "update cn_dushu_acticle set content = %s where id = %s"
    csor.execute(sql, (content, id))
    conn.commit()
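# Entry point sketch: nothing in this section shows how recrawl() is invoked,
# so running it directly here is an assumption.
if __name__ == '__main__':
    recrawl()
    csor.close()  # close the module-level cursor/connection opened at import
    conn.close()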