def add_message(message, routing_key, queue, queue_durable=False, exchange='', exchange_type=None, try_number=100):
    try:
        connection = RabbitmqServer.get_connection()
        channel = connection.channel()
        # Declare the queue that receives messages; durable controls whether the
        # queue is persisted so that messages are not lost on a broker restart.
        channel.queue_declare(queue=queue, durable=queue_durable)
        # Declare an exchange for routing the message, if one is given.
        if exchange != '':
            channel.exchange_declare(exchange=exchange, type=exchange_type)
            channel.queue_bind(exchange=exchange, routing_key=routing_key, queue=queue)
            # Publish the task; delivery_mode=2 persists the message so that
            # in-flight tasks are not lost.
            channel.basic_publish(exchange=exchange,
                                  routing_key=routing_key,
                                  body=message,
                                  properties=pika.BasicProperties(delivery_mode=2))  # make the message persistent
            print "[x] Sent %r" % message
        else:
            # No exchange: publish to the default exchange, using the queue name
            # as the routing key; delivery_mode=2 persists the message.
            channel.basic_publish(exchange=exchange,
                                  routing_key=queue,
                                  body=message,
                                  properties=pika.BasicProperties(delivery_mode=2))  # make the message persistent
            print "[x] Sent %r" % message
        connection.close()
    except Exception, e:
        print traceback.format_exc(), e.message
        FileIO.exceptionHandler(message=traceback.format_exc() + ' ' + e.message)
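# Hedged usage sketch for add_message: the queue/exchange names below are assumed
# placeholders for illustration; the real values come from the project's queue
# configuration dicts (see question_queue_exchange / page_queue_exchange elsewhere).
import json

example_task = {'url': 'http://club.xywy.com/', 'try_number': 0}    # assumed payload shape
RabbitmqServer.add_message(message=json.dumps(example_task),        # body must be a serialized string
                           routing_key='demo_routing_key',          # assumed name
                           queue='demo_queue',                      # assumed name
                           queue_durable=True,
                           exchange='demo_exchange',                # assumed name
                           exchange_type='direct')                  # assumed type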
def __process_request_xpath__(self, request, timeout=100, whether_decode=False, encode_type='utf-8'):
    """
    Handle a request and return a data type that supports XPath queries.
    :param request: the request to process
    :param timeout: timeout in seconds
    :param whether_decode: whether the response needs to be decoded
    :return: a selector that can be queried with XPath, or None on failure.
    """
    try:
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            doc = response.read()
            response.close()
            if whether_decode == True:
                doc = doc.decode(encode_type, 'ignore')
            else:
                pass
            doc = lxml.etree.HTML(doc)
        except Exception, e:
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            doc = None
        return doc
    except URLError, e:
        # Network-level failure: log it and return None.
        FileIO.exceptionHandler(message=e.message)
        print traceback.format_exc(), e.message
        return None
def parse(self):
    try:
        day_url = self.selector.xpath('//ul[@class="club_Date clearfix"]/li/a/@href')
        return day_url
    except Exception as e:
        print(traceback.format_exc(), e.args[0], e.args[1])
        FileIO.exceptionHandler(message=traceback.format_exc() + ' ' +
                                str(e.args[0]) + str(e.args[1]))
        return None
def parse(self):
    day_question_list = []
    try:
        question_content_list = self.selector.xpath(
            '//div[@class="bc mt15 DiCeng"]/div[@class="club_dic"]')
        for question_content in question_content_list:
            question = {}
            try:
                question['disease'] = question_content.xpath('h4/var/a/text()')[0]
                question['disease_url'] = question_content.xpath('h4/var/a/@href')[0]
            except:
                question['disease'] = ''
                question['disease_url'] = ''
            try:
                question['question_url'] = question_content.xpath('h4/em/a/@href')[0]
                question['question_title'] = question_content.xpath('h4/em/a/text()')[0]
            except:
                question['question_url'] = ''
                question['question_title'] = ''
            try:
                question['question_body'] = question_content.xpath('div/p/text()')[0]
            except:
                question['question_body'] = ''
            post_date = self.url_count['url'].split('/')[4]
            question['post_date'] = post_date
            question['post_month'] = post_date[0:7]
            day_question_list.append(question)
            # Save the scraped question to the corresponding queue.
            RabbitmqServer.add_message(
                message=json.dumps(question),
                routing_key=question_queue_exchange['routing_key'],
                queue=question_queue_exchange['queue'],
                queue_durable=question_queue_exchange['queue_durable'],
                exchange=question_queue_exchange['exchange'],
                exchange_type=question_queue_exchange['exchange_type'])
    except Exception as e:
        print(traceback.format_exc(), e.args[0], e.args[1])
        FileIO.exceptionHandler(message=traceback.format_exc() + ' ' +
                                str(e.args[0]) + str(e.args[1]))
        # This attempt failed; if the URL has been tried fewer than the allowed
        # number of times, put it back on the page queue for another attempt.
        if int(self.url_count['try_number']) <= url_try_number:
            RabbitmqServer.add_message(
                message=json.dumps(self.url_count),
                routing_key=page_queue_exchange['routing_key'],
                queue=page_queue_exchange['queue'],
                queue_durable=page_queue_exchange['queue_durable'],
                exchange=page_queue_exchange['exchange'],
                exchange_type=page_queue_exchange['exchange_type'])
        else:
            pass
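# Sketch of the queue/exchange configuration dict the parsers in this file expect.
# Only the key set is taken from the calls above; the concrete values are assumptions.
question_queue_exchange = {
    'routing_key': 'question',         # assumed routing key
    'queue': 'question_queue',         # assumed queue name
    'queue_durable': True,             # keep the queue across broker restarts
    'exchange': 'question_exchange',   # assumed exchange name
    'exchange_type': 'direct',         # assumed exchange type
}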
def exchange_delete(exchange):
    try:
        connection = RabbitmqServer.get_connection()
        channel = connection.channel()
        channel.exchange_delete(exchange=exchange)
        connection.close()
    except Exception, e:
        print traceback.format_exc(), e.message
        FileIO.exceptionHandler(message=traceback.format_exc() + ' ' + e.message)
def queue_delete(queue):
    try:
        connection = RabbitmqServer.get_connection()
        channel = connection.channel()
        channel.queue_delete(queue=queue)
        connection.close()
    except Exception as e:
        print(traceback.format_exc())
        FileIO.exceptionHandler(message=traceback.format_exc())
def get_connection(try_number=100):
    success = False
    for index in range(0, try_number, 1):
        print 'number', index
        try:
            connection = pika.BlockingConnection(connection_parameter)
            success = True
            return connection
        except Exception, e:
            print traceback.format_exc(), e.message
            FileIO.exceptionHandler(message=traceback.format_exc() + ' ' + e.message)
        if success == True:
            break
        else:
            pass
def __init__(self, queue, queue_durable=False, try_number=100):
    self.queue = queue
    self.queue_durable = queue_durable
    success = False
    for index in range(0, try_number, 1):
        try:
            self.connection = pika.BlockingConnection(connection_parameter)
            self.channel = self.connection.channel()
            self.channel.queue_declare(queue=queue, durable=queue_durable)
            success = True
        except Exception, e:
            print traceback.format_exc(), e.message
            FileIO.exceptionHandler(message=traceback.format_exc() + ' ' + e.message)
        if success:
            break
        else:
            pass
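# Sketch of the module-level `connection_parameter` the constructors above rely on;
# host, port and credentials are placeholders, the real values live in the project
# configuration rather than here.
import pika

credentials = pika.PlainCredentials('guest', 'guest')                 # assumed account
connection_parameter = pika.ConnectionParameters(host='localhost',    # assumed broker host
                                                 port=5672,
                                                 virtual_host='/',
                                                 credentials=credentials)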
def get_zhuanjia_doctor(self, url=None):
    if url == None:
        sel = self.process_url_request(url=self.url, whether_decode=True, encode_type='GBK')
    else:
        sel = self.process_url_request(url=url, whether_decode=True, encode_type='GBK')
    mode = re.compile(r'\d+')
    doctor = {}
    doctor['doctor_url'] = self.url
    try:
        doctor['name'] = sel.xpath('//h3[@class="fn"]/a/text()')[0].replace('专家网站', '')
        doctor['title'] = sel.xpath(
            '//div[@class="fl f12 lightblue-a lh200 pt5"]/p[2]/text()')[0].split(' ')[0]
        doctor['profession'] = sel.xpath(
            '//div[@class="fl f12 lightblue-a lh200 pt5"]/p[2]/text()')[0].split(' ')[1]
        doctor['hospital'] = sel.xpath(
            '//div[@class="fl f12 lightblue-a lh200 pt5"]/p[3]/a/text()')[0]
        doctor['department'] = sel.xpath(
            '//div[@class="fl f12 lightblue-a lh200 pt5"]/p[4]/a/text()')[0]
        doctor['skill'] = sel.xpath('//div[@id="goodat"]/text()')[0].replace('擅长:', '')
        doctor['introduce'] = sel.xpath('//div[@id="person_info"]/text()')[0].replace('简介:', '')
        doctor['help_user'] = sel.xpath('//div[@class=" f12 padd10 lh30"]/span[4]/a/text()')[0]
        doctor['good_comment'] = sel.xpath('//div[@class=" f12 padd10 lh30"]/span[5]/a/text()')[0]
        doctor['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d')
        Doctor.print_doctor(doctor)
        self.mysql.insert(table=self.zhuanjia_doctor_table, record=doctor)
        return doctor
    except Exception, e:
        print traceback.format_exc(), e.message
        FileIO.exceptionHandler(traceback.format_exc(), url=self.url)
        FileIO.writeToFile(self.url, './../data/answer_doctor_url_error.csv')
        return None
def parse(self):
    """
    Process the URL for one day: find how many result pages that day has and
    save each page URL to the page queue.
    :return:
    """
    try:
        page_numer_content = self.selector.xpath('//div[@class="club_page"]/div/text()')[-1]
        mode = re.compile(r'\d+')
        page_numer = int(mode.findall(page_numer_content)[0])
        print(self.url_count['url'], type(page_numer), page_numer)
        url_preffix = self.url_count['url'][0:len(self.url_count['url']) - 6]
        # The page count was retrieved successfully; save each page URL to the page queue.
        for number in range(1, page_numer + 1, 1):
            url = url_preffix + str(number) + '.html'
            print('page url:', url)
            url_count = {}
            url_count['url'] = url
            url_count['try_number'] = 0
            RabbitmqServer.add_message(
                message=json.dumps(url_count),
                routing_key=page_queue_exchange['routing_key'],
                queue=page_queue_exchange['queue'],
                queue_durable=page_queue_exchange['queue_durable'],
                exchange=page_queue_exchange['exchange'],
                exchange_type=page_queue_exchange['exchange_type'])
    except Exception as e:
        print(traceback.format_exc(), e.args[0], e.args[1])
        FileIO.exceptionHandler(message=traceback.format_exc() + ' ' +
                                str(e.args[0]) + str(e.args[1]))
        # This attempt failed; if the day URL has been tried fewer than the allowed
        # number of times, put it back on the day queue for another attempt.
        if self.url_count['try_number'] <= url_try_number:
            RabbitmqServer.add_message(
                message=json.dumps(self.url_count),
                routing_key=day_queue_exchange['routing_key'],
                queue=day_queue_exchange['queue'],
                queue_durable=day_queue_exchange['queue_durable'],
                exchange=day_queue_exchange['exchange'],
                exchange_type=day_queue_exchange['exchange_type'])
        else:
            pass
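# Hedged sketch of how a worker might drain the page queue that parse() fills above;
# the queue name 'page_queue' and the manual ack are assumptions, the project's real
# consumers are defined elsewhere.
import json

connection = RabbitmqServer.get_connection()
channel = connection.channel()
method_frame, header_frame, body = channel.basic_get(queue='page_queue')  # assumed queue name
if method_frame is not None:
    url_count = json.loads(body)
    print('page url:', url_count['url'], 'tries so far:', url_count['try_number'])
    channel.basic_ack(delivery_tag=method_frame.delivery_tag)
connection.close()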
def get_connection(try_number=100):
    success = False
    for index in range(0, try_number, 1):
        print('get connection number', index)
        try:
            connection = pika.BlockingConnection(connection_parameter)
            success = True
            return connection
        except Exception as e:
            print(traceback.format_exc())
            FileIO.exceptionHandler(message=traceback.format_exc())
        if success == True:
            break
        else:
            pass
    if success == False:
        sys.exit(1)
    else:
        pass
def __process_request__(self, request, timeout=100):
    """
    Handle a request and return the page content as a string.
    :param request: the request to process
    :param timeout: timeout in seconds
    :return: the page content, or None on failure.
    """
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
        doc = response.read()
        return doc
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server could not fulfill the request.')
            print('Error code: ', e.code)
            print('Reason: ', e.reason)
        FileIO.exceptionHandler(message=e.reason)
        print(traceback.format_exc(), e.reason)
        return None
    except socket.timeout as e:
        print('Error code: socket timeout', e)
        FileIO.exceptionHandler(message=e.args[0])
        print(traceback.format_exc(), e.args[0])
        return None
    except Exception as e:
        FileIO.exceptionHandler(message=e.args[0])
        print(traceback.format_exc(), e.args[0])
        print('Do not know what went wrong.')
        return None
def __process_request__(self, request, timeout=100):
    """
    Handle a request and return the page content as a string.
    :param request: the request to process
    :param timeout: timeout in seconds
    :return: the page content, or None on failure.
    """
    try:
        response = urllib2.urlopen(request, timeout=timeout)
        doc = response.read()
        return doc
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'We failed to reach a server.'
            print 'Reason: ', e.reason
        elif hasattr(e, 'code'):
            print 'The server could not fulfill the request.'
            print 'Error code: ', e.code
            print 'Reason: ', e.reason
        FileIO.exceptionHandler(message=e.message)
        print traceback.format_exc(), e.message
        return None
def __process_request_xpath__(self, request, timeout=100, whether_decode=False, encode_type='utf-8'):
    """
    Handle a request and return a data type that supports XPath queries.
    :param request: the request to process
    :param timeout: timeout in seconds
    :param whether_decode: whether the response needs to be decoded
    :return: a selector that can be queried with XPath, or None on failure.
    """
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
        try:
            doc = response.read()
            response.close()
            if whether_decode == True:
                doc = doc.decode(encode_type, 'ignore')
            else:
                pass
            doc = lxml.etree.HTML(doc)
        except Exception as e:
            FileIO.exceptionHandler(message=e.args[0])
            print(traceback.format_exc(), e.args[0])
            doc = None
        return doc
    except URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server could not fulfill the request.')
            print('Error code: ', e.code)
            print('Reason: ', e.reason)
        FileIO.exceptionHandler(message=e.reason)
        print(traceback.format_exc(), e.reason)
        return None
    except socket.timeout as e:
        print('Error code: socket timeout', e)
        FileIO.exceptionHandler(message=e.args[0])
        print(traceback.format_exc(), e.args[0])
        return None
    except Exception as e:
        FileIO.exceptionHandler(message=e.args[0])
        print(traceback.format_exc(), e.args[0])
        print('Do not know what went wrong.')
        return None
def get_family_doctor(self, url=None):
    if url == None:
        sel = self.process_url_request(url=self.url, whether_decode=True, encode_type='GBK')
    else:
        sel = self.process_url_request(url=url, whether_decode=True, encode_type='GBK')
    mode = re.compile(r'\d+')
    doctor = {}
    doctor['doctor_url'] = self.url
    try:
        name = sel.xpath('//h3[@class="fn clearfix cl"]/i/text()')
        if len(name) == 1:
            doctor['name'] = name[0].replace('医生个人主页', '')
            doctor['title'] = sel.xpath('//div[@class=" lh200 pt10 f14"]/text()')[0]
            doctor['hospital'] = sel.xpath('//div[@class=" lh200 pt10 f14"]/text()')[1].split('-')[0]
            doctor['department'] = sel.xpath('//div[@class=" lh200 pt10 f14"]/text()')[1].split('-')[1]
            doctor['skill'] = sel.xpath(
                '//div[@class="clearfix"]/div[1][@class="HomeJie f14 fwei pt20"]/div/text()')[0]
            introduce = sel.xpath(
                '//div[@class="clearfix"]/div[2][@class="HomeJie f14 fwei pt20"]/div/text()')
            if len(introduce) != 0:
                doctor['introduce'] = introduce[0]
            else:
                pass
            doctor['reputation'] = sel.xpath('//div[@class="clearfix mt20"]/span/text()')[0]
            help_content = sel.xpath(
                '//div[@class="f14 fwei HomeHelp tc lh200 clearfix pt10"]/span/text()')
            if len(help_content) == 2:
                doctor['help_user'] = help_content[0]
                doctor['sign'] = help_content[1]
            else:
                doctor['help_user'] = help_content[0]
            doctor['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d')
        else:
            url = 'http://club.xywy.com/doc_card/' + self.url.split('/')[-1]
            self.url = url
            doctor = self.get_answer_doctor(url=url)
        Doctor.print_doctor(doctor)
        self.mysql.insert(table=self.family_doctor_table, record=doctor)
        return doctor
    except Exception, e:
        print traceback.format_exc(), e.message
        FileIO.exceptionHandler(traceback.format_exc(), url=self.url)
        FileIO.writeToFile(self.url, './../data/answer_doctor_url_error.csv')
        return None
def get_answer_doctor(self, url=None):
    if url == None:
        sel = self.process_url_request(url=self.url, whether_decode=True, encode_type='GBK')
    else:
        sel = self.process_url_request(url=url, whether_decode=True, encode_type='GBK')
    mode = re.compile(r'\d+')
    doctor = {}
    doctor['doctor_url'] = self.url
    try:
        profile_list = sel.xpath('//ul[@class="fl bdul f14"]/li/span[2]/text()')
        doctor['name'] = profile_list[0]
        doctor['title'] = profile_list[1]
        doctor['department'] = profile_list[2]
        if len(sel.xpath('//ul[@class="bdxli pt10 f12 clearfix black"]/li/span/text()')) == 5:
            doctor['grade'] = sel.xpath(
                '//ul[@class="bdxli pt10 f12 clearfix black"]/li[1]/span/text()')[0]
            doctor['best_reply'] = mode.findall(
                sel.xpath('//ul[@class="bdxli pt10 f12 clearfix black"]/li[2]/span/text()')[0])[0]
            doctor['help_user'] = mode.findall(
                sel.xpath('//ul[@class="bdxli pt10 f12 clearfix black"]/li[3]/span/text()')[0])[0]
            doctor['gratitude_user'] = mode.findall(
                sel.xpath('//ul[@class="bdxli pt10 f12 clearfix black"]/li[5]/span/text()')[0])[0]
            doctor['fan'] = mode.findall(
                sel.xpath('//ul[@class="bdxli pt10 f12 clearfix black"]/li[6]/span/text()')[0])[0]
            reputation_content = sel.xpath(
                '//ul[@class="bdxli pt10 f12 clearfix black"]/li[4]/cite/img/@src')
            doctor['reputation_type'] = reputation_content[0].split('ysmp/')[1].replace('.gif', '')
            doctor['reputation'] = len(reputation_content)
        else:
            pass
        doctor['skill'] = sel.xpath(
            '//div[@class="clearfix cl djzhan f12 mr10 pt10 pb10 none"]/p/text()')[0].replace('擅长疾病:', '')
        doctor['hospital'] = sel.xpath(
            '//div[@class="clearfix cl djzhan f12 mr10 pt10 pb10 none"]/p/text()')[1].replace('所在医院:', '')
        doctor['introduce'] = sel.xpath(
            '//div[@class="clearfix cl djzhan f12 mr10 pt10 pb10 none"]/div/text()')[0].replace('个人简介:', '')
        doctor['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d')
        print '================================================================='
        Doctor.print_doctor(doctor)
        self.mysql.insert(table=self.answer_doctor_table, record=doctor)
        return doctor
    except Exception, e:
        print traceback.format_exc(), e.message
        FileIO.exceptionHandler(traceback.format_exc(), url=self.url)
        FileIO.writeToFile(self.url, './../data/answer_doctor_url_error.csv')
        return None
class BaseSpider(object):
    """
    Base spider class. It picks a random User-Agent and turns a URL into a request
    and then into the desired page representation: either a type that can be queried
    with XPath, or the raw page source as a string.
    """

    # def __init__(self):
    #     pass

    def get_header(self):
        """
        Build the request headers.
        :return: a header dict with a randomly chosen User-Agent.
        """
        return {'User-Agent': random.choice(user_agents)}

    def set_proxy(self):
        proxy_temp = {'http': random.choice(proxies)}
        proxy_handler = urllib2.ProxyHandler(proxy_temp)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener=opener)
        print '*********', proxy_temp

    def process_url_request(self, url, try_number=20, timeout=100, xpath_type=True,
                            whether_decode=False, encode_type='utf-8', use_proxy=False):
        """
        Fetch the page content for a URL and return it in the requested form.
        :param url: the target URL
        :param try_number: number of attempts
        :param timeout: timeout in seconds
        :param xpath_type: whether to return a type that supports XPath queries
        :param whether_decode: whether the response needs to be decoded
        :param encode_type: the encoding to decode with, if decoding is requested
        :param use_proxy: whether to use a proxy server, boolean
        :return: the page data, or None after all attempts fail.
        """
        doc = None
        try_index = 0
        if xpath_type == True:
            while doc == None:
                if use_proxy == True:
                    self.set_proxy()
                else:
                    pass
                request = urllib2.Request(url=url, headers=self.get_header())
                doc = self.__process_request_xpath__(request=request,
                                                     timeout=timeout,
                                                     whether_decode=whether_decode,
                                                     encode_type=encode_type)
                try_index = try_index + 1
                if try_index > try_number:
                    break
                else:
                    pass
            return doc
        else:
            while doc == None:
                request = urllib2.Request(url=url, headers=self.get_header())
                doc = self.__process_request__(request=request, timeout=timeout)
                try_index = try_index + 1
                if try_index > try_number:
                    break
                else:
                    pass
            return doc

    def __process_request_xpath__(self, request, timeout=100, whether_decode=False, encode_type='utf-8'):
        """
        Handle a request and return a data type that supports XPath queries.
        :param request: the request to process
        :param timeout: timeout in seconds
        :param whether_decode: whether the response needs to be decoded
        :return: a selector that can be queried with XPath, or None on failure.
        """
        try:
            response = urllib2.urlopen(request, timeout=timeout)
            try:
                doc = response.read()
                response.close()
                if whether_decode == True:
                    doc = doc.decode(encode_type, 'ignore')
                else:
                    pass
                doc = lxml.etree.HTML(doc)
            except Exception, e:
                FileIO.exceptionHandler(message=e.message)
                print traceback.format_exc(), e.message
                doc = None
            return doc
        except URLError, e:
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server could not fulfill the request.'
                print 'Error code: ', e.code
                print 'Reason: ', e.reason
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            return None
        except socket.timeout, e:
            print 'Error code: socket timeout', e
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            return None
        except Exception, e:
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            print 'Do not know what went wrong.'
            return None

    def __process_request__(self, request, timeout=100):
        """
        Handle a request and return the page content as a string.
        :param request: the request to process
        :param timeout: timeout in seconds
        :return: the page content, or None on failure.
        """
        try:
            response = urllib2.urlopen(request, timeout=timeout)
            doc = response.read()
            return doc
        except URLError, e:
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server could not fulfill the request.'
                print 'Error code: ', e.code
                print 'Reason: ', e.reason
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            return None
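# Hedged usage sketch for BaseSpider.process_url_request: the URL and the XPath are
# placeholders; the real spiders in this project subclass BaseSpider and query the
# returned lxml tree with their own selectors.
spider = BaseSpider()
selector = spider.process_url_request(url='http://club.xywy.com/',   # assumed entry page
                                      whether_decode=True,
                                      encode_type='GBK')             # the scraped pages are GBK-encoded
if selector is not None:
    links = selector.xpath('//a/@href')   # any XPath works on the returned tree
    print 'found %d links' % len(links)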