import logging

from pybloomfilter import BloomFilter          # assumed: pybloomfiltermmap package
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        self.insert(fp)
        return False

    ###-------todo-------##
    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def check(self, fp):
        # fp is the request fingerprint computed above
        return fp in self.bloom_filter_

    def insert(self, fp):
        self.bloom_filter_.add(fp)
        # print len(self.bloom_filter_)
        # print self.bloom_filter_.hash_seeds
        # print self.bloom_filter_.num_bits
        # print self.bloom_filter_.num_hashes

    def reset(self):
        self.bloom_filter_.clear_all()

    def save(self):
        pass

    def load(self):
        # Flush the current filter to disk, then reopen the dump file
        self.bloom_filter_.sync()
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
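# Hedged usage sketch (not part of the original source): settings a Scrapy project
# would need in order to enable the dupefilter above. DUPEFILTER_CAPACITY and
# DUPEFILTER_FILENAME are the custom keys URIBloomFilter reads; the dotted class
# path is an assumed module location and the values are illustrative.
DUPEFILTER_CLASS = 'myproject.dupefilters.URIBloomFilter'  # assumed project layout
DUPEFILTER_CAPACITY = 10000000        # expected number of distinct requests
DUPEFILTER_FILENAME = 'filter.bloom'  # on-disk dump of the bloom filter
DUPEFILTER_DEBUG = False              # set True to log every filtered duplicate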
from pybloomfilter import BloomFilter  # assumed: pybloomfiltermmap package


class bloomFilter():
    # Thin wrapper around pybloomfilter.BloomFilter

    # Create an empty bloom filter backed by `filename`
    def create_new_bf(self, capacity, error_rate, filename):
        self.bf = BloomFilter(capacity, error_rate, filename)

    # Open an existing bloom filter dump
    def open_bf(self, filename):
        self.bf = BloomFilter.open(filename)

    def add_item(self, item):
        self.bf.add(item)

    def check_membership(self, item):
        return item in self.bf

    def clear_all(self):
        self.bf.clear_all()
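# Hedged usage sketch for the wrapper above; the capacity, filename and URL are
# illustrative placeholders, not values taken from the original project.
if __name__ == '__main__':
    bf = bloomFilter()
    bf.create_new_bf(1000000, 0.01, 'example.bloom')                   # hypothetical file
    bf.add_item('http://example.com/page-1.html')                      # hypothetical item
    print bf.check_membership('http://example.com/page-1.html')        # -> True
    reopened = bloomFilter()
    reopened.open_bf('example.bloom')                                  # reopen the same dump
    print reopened.check_membership('http://example.com/page-1.html')  # -> True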
# -*- coding: utf-8 -*-
import os
import re
import time
import datetime

import requests
from bs4 import BeautifulSoup           # assumed: BeautifulSoup 4
from pybloomfilter import BloomFilter   # assumed: pybloomfiltermmap package

import mysql  # project-local MySQL helper module


class MySpider(object):
    def __init__(self, start_url, basic_url):
        self.basic_url = basic_url
        self.start_url = start_url
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests
        # Deduplicate with a bloom filter; reload the dump file from disk on every start
        if os.path.isfile('filter.bloom'):
            self.bf = BloomFilter.open('filter.bloom')
        else:
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def __get_time(self):
        return self.datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def __cal_time(self, date1, date2):
        date1 = self.time.strptime(date1, "%Y-%m-%d %H:%M:%S")
        date2 = self.time.strptime(date2, "%Y-%m-%d %H:%M:%S")
        date1 = self.datetime.datetime(date1[0], date1[1], date1[2], date1[3], date1[4], date1[5])
        date2 = self.datetime.datetime(date2[0], date2[1], date2[2], date2[3], date2[4], date2[5])
        return str(date2 - date1)

    def __log(self, log_str, *args, **kw):
        current_time = self.__get_time()
        print current_time + ' : ' + log_str,
        for each in args:
            print each,
        print '\n',
        for each in kw.keys():
            print current_time + ' : ' + each + ' is ' + kw[each]

    def __process_text(self, my_str):
        # Strip links, then remove spaces and newlines
        my_str = self.re.sub("http.*?html", "", my_str).encode('utf-8')
        if isinstance(my_str, unicode):
            my_str = my_str.encode('utf-8')
        return my_str.replace(" ", "").replace(" ", "").replace('\n', '')

    def __open_url(self, url):
        req = self.requests.get(url)
        content = req.text
        soup = BeautifulSoup(content)
        return soup

    def __process_sub_content(self, result, insert_id):
        for each in result:
            # print '\n' + 'floor:'
            # time and author
            # print each.attrs.get('js_restime')
            # print each.a.text
            # content
            # bbs_content = each.select("[class~=bbs-content]")
            # text = bbs_content[0].text.strip()
            # strip links and spaces
            # text = re.sub("http.*?html", "", text).encode('utf-8')
            # text = text.replace(" ", "").replace(" ", "")
            # print self.__process_text(text)
            # print process_text(text)
            replies = each.select('ul li')
            for reply in replies:
                self.__log('process the reply ... start')
                reply_time = reply.get('_replytime')
                reply_author = reply.get('_username')
                reply_content = reply.select("[class~=ir-content]")
                reply_text = reply_content[0].text
                reply_dict = {
                    "title_id": insert_id,
                    "author": reply_author,
                    "time": reply_time,
                    "text": reply_text
                }
                self.__log('content is', reply_text)
                self.__log('insert to database ... start')
                self.mysql.insert_data('reply', reply_dict)
                self.__log('insert to database ... done')
                self.__log('process the reply ... done')

    # Process every floor (reply) of a topic
    def process_content_page(self, url, author, reply_time, insert_id):
        self.__process_reply_page(url, author, reply_time, insert_id)

    def __process_reply_page(self, url, author, reply_time, insert_id):
        self.__log('process reply page ... start')
        soup = self.__open_url(url)
        # Tags of the individual floors
        result = soup.select("[class~=atl-item]")
        if len(result):
            self.__log('the html was read successfully')
        else:
            self.__log('html read fail. maybe the page is lost. function returns')
            self.__log('process reply page ... done')
            return
        # Total number of reply pages
        page_id = soup.select("form a")
        if page_id:
            total_page_num = int(page_id[-2].text)
        else:
            total_page_num = 1
        self.__log('have read', total_page_num, 'pages')
        # Content of the first floor (the original post)
        main_content = result[0].select("[class~=bbs-content]")
        main_content = main_content[0].text.strip()
        main_text = self.__process_text(main_content)
        reply_dict = {
            "title_id": insert_id,
            "author": author,
            "time": reply_time,
            "text": main_text
        }
        self.mysql.insert_data('reply', reply_dict)
        result = result[1:]
        self.__log('process every floor')
        self.__process_sub_content(result, insert_id)
        if total_page_num > 1:
            for num in range(2, total_page_num + 1):
                self.__log('process the', str(num), 'reply page ... start')
                next_url = url[:-7] + str(num) + url[-6:]
                print next_url
                new_soup = self.__open_url(next_url)
                result = new_soup.select("[class~=atl-item]")
                self.__process_sub_content(result, insert_id)
                self.__log('process the', str(num), 'reply page ... done')
        self.__log('process reply page ... done')

    def __process_titles_page(self, page_url):
        self.__log('reading titles page ... start')
        req = self.requests.get(page_url)
        content = req.text
        soup = BeautifulSoup(content)
        # Grab all title rows
        titles = soup.select('tbody tr')
        # Drop the header row
        titles = titles[1:]
        # Process each title
        self.__log('reading titles page ... done')
        self.__log('processing all titles in', self.start_url, ' ... start')
        counter = 1
        for each in titles:
            # Read the tag info of the title row.
            # Note that in BeautifulSoup the whitespace between two adjacent <td> tags
            # is also a node, so the content indices below have to account for it.
            self.__log('process the', counter, 'title', ' ... start')
            counter += 1
            title_content = each.contents
            title_href = title_content[1].a.get('href')   # title link
            title_text = title_content[1].text.strip()    # title text
            title_author = title_content[3].a.text        # author
            title_click_num = title_content[5].text       # click count
            title_reply_number = title_content[7].text    # reply count
            title_time = title_content[9].get('title')    # post time
            sub_href = self.basic_url + title_href        # full sub-link
            # Build the title dict for insertion
            title_dict = {
                "reply_num": title_reply_number,
                "click_num": title_click_num,
                "author": title_author,
                "time": title_time,
                "link": sub_href,
                "text": title_text
            }
            # for each in title_dict:
            #     print each
            #     print type(title_dict[each])
            # Use the link address (and optionally the count) to decide whether this is a duplicate
            # flag = sub_href + title_click_num
            flag = sub_href
            if not self.bf.add(flag):
                self.__log('', flag, 'not in bloom filter')
                self.__log('insert to database ... start')
                insert_id = self.mysql.insert_data("titles", title_dict)
                self.__log('insert to database ... done')
                self.__process_reply_page(sub_href, title_author.encode('utf-8'),
                                          title_time, str(insert_id))
            self.__log('process the', counter, 'title', ' ... done')
        # Link to the next titles page
        next_page_tag = soup.find('a', text='下一页')
        if next_page_tag:
            next_page = next_page_tag.get('href')
            next_page = self.basic_url + next_page
        else:
            next_page = None
        return next_page

    # Clear the bloom filter
    def clean_bloom_filter(self):
        self.__log('clean all in bloom filter ... start')
        self.bf.clear_all()
        self.__log('clean all in bloom filter ... done')

    def bloom_filter_len(self):
        return len(self.bf)

    def main(self):
        self.__log('spider ... start')
        self.__log('process start url ... running')
        next_page = self.__process_titles_page(self.start_url)
        self.__log('process start url ... done')
        start_time = self.__get_time()
        print start_time
        depth = 1
        while next_page:
            # if depth == 2:
            #     break
            self.__log('now it is the', str(depth), 'page')
            next_page = self.__process_titles_page(next_page)
            depth += 1
        end_time = self.__get_time()
        print end_time
        duration = self.__cal_time(start_time, end_time)
        self.__log('duration is', duration)
        self.__log('spider ... done')
    def clean_table(self, table):
        self.mysql.clean_table(table)

    def test(self):
        test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
        print self.bf.add(test_url)
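# Hedged launch sketch for the Tianya spider above; only bbs.tianya.cn itself appears
# in the original code, so the board-listing URL below is a hypothetical placeholder.
if __name__ == '__main__':
    basic_url = 'http://bbs.tianya.cn'
    start_url = 'http://bbs.tianya.cn/list-xxx-1.shtml'  # hypothetical listing page
    spider = MySpider(start_url, basic_url)
    spider.main()
    print spider.bloom_filter_len()  # number of URLs the filter has seen so far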
import time
import re
import Queue

import redis
from pybloomfilter import BloomFilter  # assumed: pybloomfiltermmap package

# !!! db=1: use Redis db 1 to store the seeds
r = redis.Redis(host='192.168.134.235', password='******', db=1)

done_sites_fname = 'done_sites.bin'
try:
    bfdone = BloomFilter.open(done_sites_fname)
except Exception:
    print "cannot open the dump file, creating a new one"
    bfdone = BloomFilter(2 ** 23, 0.00001, done_sites_fname)  # ~8M capacity
    bfdone.clear_all()

# First, sanity-check the bloom filter against the deduplicated URL list
f = "urls_uniq.txt"
urls = open(f).read().strip().split('\n')
for url in urls:
    if url in bfdone:
        print "Error"
        exit(0)
print "BF is ok"

# Note: ids in each db increase from 1 to n per database,
# rather than sequentially across all databases
cmd = "select id from mainpages"
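# Hedged sketch (not from the source): how the "done" filter above would typically be
# updated during the crawl -- record a URL once it has been fetched so later runs,
# and the startup check above, can skip it. mark_url_done is a hypothetical helper.
def mark_url_done(url):
    if url in bfdone:
        return False   # already crawled in an earlier run
    bfdone.add(url)    # remember it; the mmap-backed dump file persists across runs
    return True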
import os
import re
import time
import datetime

import requests
from bs4 import BeautifulSoup           # assumed: BeautifulSoup 4
from pybloomfilter import BloomFilter   # assumed: pybloomfiltermmap package

import mysql  # project-local MySQL helper module


class MySpider(object):
    def __init__(self):
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests
        # Deduplicate with a bloom filter; reload the dump file from disk on every start
        if os.path.isfile("new_filter.bloom"):
            self.bf = BloomFilter.open("new_filter.bloom")
        else:
            self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")

    def __process_text(self, my_str):
        my_str = self.re.sub("http.*?html", "", my_str).encode("utf-8")
        if isinstance(my_str, unicode):
            my_str = my_str.encode("utf-8")
        return my_str.replace(" ", "").replace(" ", "").replace("\n", "")

    def open_url(self, url):
        html = self.requests.get(url)
        code = html.encoding
        # print code
        content = html.content.decode(code, "ignore")
        soup = BeautifulSoup(content)
        return soup

    def process_content_page(self, url):
        soup = self.open_url(url)
        body = soup.find_all("p")
        # print soup.contents
        content = ""
        for each in body:
            content += each.text.strip()
            # print each.text.strip()
        return self.__process_text(content)

    def process_title_page(self, url):
        soup = self.open_url(url)
        result = soup.find("table", class_="mt12 p14")
        titles = result.find_all("tr")
        titles = titles[1:-1]
        # Process each title
        for each in titles:
            title_href = each.a.get("href")
            if not self.bf.add(title_href):
                text = each.text.strip()
                title_time = "20" + text[-14:] + ":00"
                content = text[1:-15].strip()
                print title_time + "\n" + content + "\n" + title_href
                title_text = self.process_content_page(title_href)
                # Build the dict to insert
                title_dict = {
                    "link": title_href,
                    "title": content,
                    "text": title_text.decode("utf-8", "ignore"),
                    "time": title_time,
                }
                # Insert into the database
                self.mysql.insert_data("gzrb_titles", title_dict)
        # Find the next page
        result = soup.find("table", class_="mt12 p12")
        result = result.find_all("a")
        if len(result) == 1:
            next_page_href = result[0].get("href")
            # print result[0].text
        elif len(result) == 2:
            next_page_href = result[1].get("href")
            # print result[1].text
        else:
            next_page_href = None
        # print next_page_href
        return next_page_href

    def clean_bloom_filter(self):
        self.bf.clear_all()

    def process_nav(self, url):
        # For each title page, keep iterating while there is a next page
        next_page = self.process_title_page(url)
        # Crawl depth
        depth = 1
        while next_page:
            if not self.bf.add(next_page):
                next_page = self.process_title_page(next_page)
            else:
                next_page = None
            if depth == 10:
                return
            depth += 1

    def main(self, start_url):
        soup = self.open_url(start_url)
        # Collect the navigation-bar links
        nav = soup.find("div", class_="nav")
        result = nav.find_all("li")
        # Drop the '#' placeholder entries
        result = result[1:-1]
        for each in result:
            nav_href = each.a.get("href")  # link of each nav entry
            # print nav_href
            self.process_nav(nav_href)     # process each nav section
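# Hedged launch sketch for the newspaper spider above; the start URL is an
# illustrative placeholder, not taken from the original source.
if __name__ == '__main__':
    spider = MySpider()
    spider.main('http://www.example-newspaper.com/index.html')  # hypothetical nav page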