def parse(self):
     """Collect the link targets of the date list on the parsed page.

     Runs a fixed XPath query against ``self.selector`` and returns its
     result (the ``href`` values of the anchors inside the
     ``club_Date clearfix`` list).

     :return: the XPath result, or ``None`` when the query raised; the
         failure is printed and handed to ``FileIO.exceptionHandler``.
     """
     try:
         return self.selector.xpath(
             '//ul[@class="club_Date clearfix"]/li/a/@href')
     except Exception as e:
         # Join whatever arguments the exception carries instead of indexing
         # args[0] and args[1]: most exceptions have fewer than two
         # arguments, and the indexing raised IndexError inside this handler.
         trace = traceback.format_exc()
         print(trace, ' '.join(str(arg) for arg in e.args))
         FileIO.exceptionHandler(
             message=trace + '   ' + ''.join(str(arg) for arg in e.args))
         return None
Beispiel #2
0
    def parse(self):
        """Scrape every question on the current list page and queue it.

        For each ``club_dic`` entry found through ``self.selector`` a dict is
        built with the disease name and link, the question link and title,
        the question body and the post date (the fifth ``/``-separated piece
        of ``self.url_count['url']``).  Each dict is published as JSON to the
        question queue.  A group of fields whose XPath lookup fails is stored
        as empty strings.

        If anything else fails, the error is logged and, as long as
        ``self.url_count['try_number']`` has not exceeded ``url_try_number``,
        the page descriptor is published back to the page queue.

        :return: None
        """
        try:
            question_content_list = self.selector.xpath(
                '//div[@class="bc mt15 DiCeng"]/div[@class="club_dic"]')
            for question_content in question_content_list:
                question = {}
                try:
                    question['disease'] = question_content.xpath(
                        'h4/var/a/text()')[0]
                    question['disease_url'] = question_content.xpath(
                        'h4/var/a/@href')[0]
                except Exception:
                    question['disease'] = ''
                    question['disease_url'] = ''

                try:
                    question['question_url'] = question_content.xpath(
                        'h4/em/a/@href')[0]
                    question['question_title'] = question_content.xpath(
                        'h4/em/a/text()')[0]
                except Exception:
                    question['question_url'] = ''
                    question['question_title'] = ''

                try:
                    question['question_body'] = question_content.xpath(
                        'div/p/text()')[0]
                except Exception:
                    question['question_body'] = ''

                post_date = self.url_count['url'].split('/')[4]
                question['post_date'] = post_date
                question['post_month'] = post_date[0:7]
                # Save the scraped question to its queue.
                RabbitmqServer.add_message(
                    message=json.dumps(question),
                    routing_key=question_queue_exchange['routing_key'],
                    queue=question_queue_exchange['queue'],
                    queue_durable=question_queue_exchange['queue_durable'],
                    exchange=question_queue_exchange['exchange'],
                    exchange_type=question_queue_exchange['exchange_type'])
        except Exception as e:
            # Join the exception arguments instead of indexing args[0] and
            # args[1]; exceptions with fewer than two arguments made the
            # handler itself raise IndexError.
            trace = traceback.format_exc()
            detail = ''.join(str(arg) for arg in e.args)
            print(trace, detail)
            FileIO.exceptionHandler(message=trace + ' ' + detail)
            # This attempt failed and the retry limit has not been exceeded:
            # put the page's url back on its original queue for another try.
            # NOTE(review): try_number is not incremented here - confirm the
            # consumer does it, otherwise a failing page is retried forever.
            if int(self.url_count['try_number']) <= url_try_number:
                RabbitmqServer.add_message(
                    # Serialised like every other message that is published.
                    message=json.dumps(self.url_count),
                    routing_key=page_queue_exchange['routing_key'],
                    queue=page_queue_exchange['queue'],
                    queue_durable=page_queue_exchange['queue_durable'],
                    exchange=page_queue_exchange['exchange'],
                    exchange_type=page_queue_exchange['exchange_type'])
Beispiel #3
0
 def queue_delete(queue):
     """Delete ``queue`` on the broker; failures are logged, not raised.

     :param queue: name of the queue to delete.
     :return: None
     """
     try:
         connection = RabbitmqServer.get_connection()
         try:
             channel = connection.channel()
             channel.queue_delete(queue=queue)
         finally:
             # Close the connection even when the delete itself fails.
             connection.close()
     except Exception:
         trace = traceback.format_exc()
         print(trace)
         FileIO.exceptionHandler(message=trace)
Beispiel #4
0
 def exchange_delete(exchange):
     """Delete ``exchange`` on the broker; failures are logged, not raised.

     :param exchange: name of the exchange to delete.
     :return: None
     """
     try:
         connection = RabbitmqServer.get_connection()
         try:
             channel = connection.channel()
             channel.exchange_delete(exchange=exchange)
         finally:
             # Close the connection even when the delete itself fails.
             connection.close()
     except Exception as e:
         # str(e) replaces e.message, which no longer exists on Python 3.
         trace = traceback.format_exc()
         print('%s %s' % (trace, e))
         FileIO.exceptionHandler(message=trace + ' ' + str(e))
Beispiel #5
0
def load_file(filename):
    """Convert a file of JSON lines into rows of the disease-count CSV.

    Writes the header row, then decodes every line of ``filename`` as JSON,
    passes it through ``load_line`` and appends the result to
    ``./../result/disease_count_2.csv``.

    :param filename: path of the input file, one JSON document per line.
    :return: None
    """
    output = './../result/disease_count_2.csv'
    # The department variant of this report uses 'department' as the first
    # header column and ./../result/department_count_2.csv as the output.
    header = 'disease,201601,201602,201603,201604,201605,201606,201607,201608,201609,201610,201611,201612'
    # The context manager closes the input even if a line fails to parse.
    with open(filename, 'r') as source:
        FileIO.writeToFile(text=header, filename=output)
        for raw_line in source:
            record = json.loads(raw_line)
            print(type(record), record)
            FileIO.writeToFile(text=load_line(record), filename=output)
Beispiel #6
0
 def get_connection(try_number=100):
     """Open a blocking connection to the broker, retrying on failure.

     :param try_number: maximum number of connection attempts.
     :return: the ``pika.BlockingConnection`` of the first successful
         attempt, or ``None`` when every attempt failed.
     """
     for index in range(0, try_number, 1):
         print('number %d' % index)
         try:
             return pika.BlockingConnection(connection_parameter)
         except Exception as e:
             # str(e) replaces e.message, which no longer exists on Python 3.
             trace = traceback.format_exc()
             print('%s %s' % (trace, e))
             FileIO.exceptionHandler(message=trace + '  ' + str(e))
     # Every attempt failed; make the fall-through result explicit.
     return None
Beispiel #7
0
 def __init__(self, queue, queue_durable=False, try_number=100):
     """Store the queue settings and connect to the broker.

     Tries up to ``try_number`` times to open a connection, open a channel
     and declare the queue.  Failures are logged; if every attempt fails the
     instance is left without a usable ``connection``/``channel``.

     :param queue: name of the queue to declare.
     :param queue_durable: passed as ``durable`` to ``queue_declare``.
     :param try_number: maximum number of connection attempts.
     """
     self.queue = queue
     self.queue_durable = queue_durable
     for _ in range(0, try_number, 1):
         connection = None
         try:
             connection = pika.BlockingConnection(connection_parameter)
             channel = connection.channel()
             channel.queue_declare(queue=queue, durable=queue_durable)
         except Exception as e:
             # str(e) replaces e.message, which no longer exists on Python 3.
             trace = traceback.format_exc()
             print('%s %s' % (trace, e))
             FileIO.exceptionHandler(message=trace + ' ' + str(e))
             # Do not leak a half-initialised connection before retrying.
             if connection is not None:
                 try:
                     connection.close()
                 except Exception:
                     pass
         else:
             self.connection = connection
             self.channel = channel
             break
Beispiel #8
0
 def get_zhuanjia_doctor(self, url=None):
     """Scrape an expert doctor page and store the extracted profile.

     The page is fetched GBK-decoded through ``self.process_url_request``,
     the profile fields are read with XPath, printed and inserted into
     ``self.zhuanjia_doctor_table``.

     :param url: page to fetch; ``self.url`` is used when omitted.  The
         stored ``doctor_url`` is always ``self.url``.
     :return: the profile dict, or ``None`` when fetching, parsing or the
         insert raised (the failure is logged and ``self.url`` is appended
         to the error-url file).
     """
     sel = self.process_url_request(url=self.url if url is None else url,
                                    whether_decode=True,
                                    encode_type='GBK')
     doctor = {}
     doctor['doctor_url'] = self.url
     try:
         # A failed fetch leaves sel as None; the resulting AttributeError is
         # handled by the except clause below like any other parse failure.
         profile = '//div[@class="fl f12 lightblue-a lh200 pt5"]'
         statistics = '//div[@class=" f12 padd10 lh30"]'
         doctor['name'] = sel.xpath(
             '//h3[@class="fn"]/a/text()')[0].replace('专家网站', '')
         # The second paragraph holds title and profession, space separated.
         title_parts = sel.xpath(profile + '/p[2]/text()')[0].split(' ')
         doctor['title'] = title_parts[0]
         doctor['profession'] = title_parts[1]
         doctor['hospital'] = sel.xpath(profile + '/p[3]/a/text()')[0]
         doctor['department'] = sel.xpath(profile + '/p[4]/a/text()')[0]
         doctor['skill'] = sel.xpath(
             '//div[@id="goodat"]/text()')[0].replace('擅长:', '')
         doctor['introduce'] = sel.xpath(
             '//div[@id="person_info"]/text()')[0].replace('简介:', '')
         doctor['help_user'] = sel.xpath(
             statistics + '/span[4]/a/text()')[0]
         doctor['good_comment'] = sel.xpath(
             statistics + '/span[5]/a/text()')[0]
         doctor['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d')
         Doctor.print_doctor(doctor)
         self.mysql.insert(table=self.zhuanjia_doctor_table, record=doctor)
         return doctor
     except Exception as e:
         trace = traceback.format_exc()
         print('%s %s' % (trace, e))
         FileIO.exceptionHandler(trace, url=self.url)
         FileIO.writeToFile(self.url,
                            './../data/answer_doctor_url_error.csv')
         return None
Beispiel #9
0
 def parse(self):
     """
     Handle the url of a single day: read how many pages that day has and
     publish the url of every page to the page queue.

     On failure the error is logged and, while the day's ``try_number`` has
     not exceeded ``url_try_number``, the day's url descriptor is published
     back to the day queue for another attempt.
     :return: None
     """
     try:
         page_numer_content = self.selector.xpath(
             '//div[@class="club_page"]/div/text()')[-1]
         # The first run of digits in the pager text is the page count.
         page_numer = int(re.findall(r'\d+', page_numer_content)[0])
         day_url = self.url_count['url']
         print(day_url, type(page_numer), page_numer)
         # Drop the last six characters of the day url to get the prefix the
         # page number and '.html' are appended to.
         url_preffix = day_url[0:len(day_url) - 6]
         # Page count obtained: save every page url to the page queue.
         for number in range(1, page_numer + 1, 1):
             url = url_preffix + str(number) + '.html'
             print('page url:', url)
             url_count = {'url': url, 'try_number': 0}
             RabbitmqServer.add_message(
                 message=json.dumps(url_count),
                 routing_key=page_queue_exchange['routing_key'],
                 queue=page_queue_exchange['queue'],
                 queue_durable=page_queue_exchange['queue_durable'],
                 exchange=page_queue_exchange['exchange'],
                 exchange_type=page_queue_exchange['exchange_type'])
     except Exception as e:
         # Join the exception arguments instead of indexing args[0] and
         # args[1]; exceptions with fewer than two arguments made the
         # handler itself raise IndexError.
         trace = traceback.format_exc()
         detail = ''.join(str(arg) for arg in e.args)
         print(trace, detail)
         FileIO.exceptionHandler(message=trace + ' ' + detail)
         # This attempt failed and the retry limit has not been exceeded:
         # put the day's url back on the queue of date urls for another try.
         # NOTE(review): try_number is not incremented here - confirm the
         # consumer does it, otherwise a failing day is retried forever.
         if int(self.url_count['try_number']) <= url_try_number:
             RabbitmqServer.add_message(
                 message=json.dumps(self.url_count),
                 routing_key=day_queue_exchange['routing_key'],
                 queue=day_queue_exchange['queue'],
                 queue_durable=day_queue_exchange['queue_durable'],
                 exchange=day_queue_exchange['exchange'],
                 exchange_type=day_queue_exchange['exchange_type'])
Beispiel #10
0
 def get_connection(try_number=100):
     """Return a blocking broker connection or terminate the process.

     Makes up to ``try_number`` attempts to build a
     ``pika.BlockingConnection`` from ``connection_parameter`` and returns
     the first one that succeeds.  Every failed attempt is printed and
     passed to ``FileIO.exceptionHandler``.  When no attempt succeeds the
     process exits through ``sys.exit(1)``.

     :param try_number: maximum number of attempts.
     :return: the open connection.
     """
     for attempt in range(try_number):
         print('get connection number', attempt)
         try:
             return pika.BlockingConnection(connection_parameter)
         except Exception:
             print(traceback.format_exc())
             FileIO.exceptionHandler(message=traceback.format_exc())
     # Reached only when no attempt returned a connection.
     sys.exit(1)
Beispiel #11
0
    def callback(self, ch, method, properties, body):
        """Persist one received book record, back it up and acknowledge it.

        The body is decoded as JSON (for display only), written to
        ``./../data/book_info.json`` with newlines removed, republished
        unchanged to the backup queue and then acknowledged.  Afterwards the
        connection sleeps for ``time_sleep * 0.05``.

        :param ch: channel used to acknowledge the delivery.
        :param method: delivery metadata; supplies ``delivery_tag``.
        :param properties: message properties (unused).
        :param body: the message payload, expected to be JSON text.
        """
        print('[X] get url: %s' % body)
        record = json.loads(body)
        print(json.dumps(record, indent=2))
        # Remove newline characters before writing the record.  The former
        # pattern "/n" was a typo: it deleted the two characters slash + n
        # (for example inside urls) and left real newlines in place.
        message = body.replace("\n", "")
        FileIO.writeToFile(text=message, filename="./../data/book_info.json")

        # Backup
        RabbitmqServer.add_message(
            message=body,
            routing_key=book_info_bkup_queue_exchange['routing_key'],
            queue=book_info_bkup_queue_exchange['queue'],
            queue_durable=book_info_bkup_queue_exchange['queue_durable'],
            exchange=book_info_bkup_queue_exchange['exchange'],
            exchange_type=book_info_bkup_queue_exchange['exchange_type'])

        ch.basic_ack(delivery_tag=method.delivery_tag)
        print('sleeping...')
        self.connection.sleep(time_sleep * 0.05)
0
 def add_message(message,
                 routing_key,
                 queue,
                 queue_durable=False,
                 exchange='',
                 exchange_type=None,
                 try_number=100):
     """Publish ``message`` as a persistent message; failures are logged.

     A new connection is opened for the call and closed afterwards.  With a
     named ``exchange`` the exchange is declared, ``queue`` is bound to it
     with ``routing_key`` and the message is published through it.  With the
     default exchange (``''``) the message is published straight to
     ``queue``.

     :param message: body to publish.
     :param routing_key: routing key used with a named exchange.
     :param queue: queue to declare (and bind or publish to).
     :param queue_durable: passed as ``durable`` to ``queue_declare``.
     :param exchange: exchange name; ``''`` selects the default exchange.
     :param exchange_type: type used when declaring a named exchange.
     :param try_number: accepted for compatibility; not used.
     :return: None
     """
     try:
         connection = RabbitmqServer.get_connection()
         try:
             channel = connection.channel()
             # Declare the queue that receives the message; ``durable``
             # decides whether the queue itself is persisted.
             channel.queue_declare(queue=queue, durable=queue_durable)
             named_exchange = exchange != ''
             if named_exchange:
                 # Declare the exchange that carries the message and bind
                 # the queue to it.
                 channel.exchange_declare(exchange=exchange,
                                          exchange_type=exchange_type)
                 channel.queue_bind(exchange=exchange,
                                    routing_key=routing_key,
                                    queue=queue)
             # delivery_mode=2 marks the message as persistent.
             channel.basic_publish(
                 exchange=exchange,
                 routing_key=routing_key if named_exchange else queue,
                 body=message,
                 properties=pika.BasicProperties(delivery_mode=2))
             if named_exchange:
                 print("[x] Sent %r" % message)
         finally:
             # Close the connection even when declaring or publishing fails.
             connection.close()
     except Exception:
         trace = traceback.format_exc()
         print(trace)
         FileIO.exceptionHandler(message=trace)
Beispiel #13
0
 def __process_request__(self, request, timeout=100):
     """
     Open ``request`` and return the raw page content.
     :param request: the request to open.
     :param timeout: timeout passed to ``urlopen``.
     :return: the response content; ``None`` on failure.
     """
     try:
         response = urllib.request.urlopen(request, timeout=timeout)
         try:
             return response.read()
         finally:
             # Release the response whether or not the read succeeded.
             response.close()
     except URLError as e:
         # An HTTP error carries both ``code`` and ``reason``, so ``code``
         # has to be tested first; testing ``reason`` first made the HTTP
         # branch unreachable.
         if hasattr(e, 'code'):
             print('The server could not fulfill the request.')
             print('Error code: ', e.code)
             print('Reason: ', e.reason)
         elif hasattr(e, 'reason'):
             print('We failed to raach a server.')
             print('Reaseon: ', e.reason)
         # ``reason`` may be an exception object; log its text.
         FileIO.exceptionHandler(message=str(e.reason))
         print(traceback.format_exc(), e.reason)
         return None
     except socket.timeout as e:
         print('Error code: socket timeout', e)
         # str(e) is safe even when the exception carries no arguments.
         FileIO.exceptionHandler(message=str(e))
         print(traceback.format_exc(), str(e))
         return None
     except Exception as e:
         FileIO.exceptionHandler(message=str(e))
         print(traceback.format_exc(), str(e))
         print('Do Not know what is wrong.')
         return None
Beispiel #14
0
 def __process_request__(self, request, timeout=100):
     """
     Open ``request`` and return the raw page content.
     :param request: the request to open.
     :param timeout: timeout passed to ``urlopen``.
     :return: the response content; ``None`` when a URLError occurred.
     """
     try:
         response = urllib2.urlopen(request, timeout=timeout)
         try:
             return response.read()
         finally:
             # Release the response whether or not the read succeeded.
             response.close()
     except URLError as e:
         # An HTTP error carries both ``code`` and ``reason``, so ``code``
         # has to be tested first; testing ``reason`` first made the HTTP
         # branch unreachable.
         if hasattr(e, 'code'):
             print('The server could not fulfill the request.')
             print('Error code:  %s' % (e.code,))
             print('Reason:  %s' % (e.reason,))
         elif hasattr(e, 'reason'):
             print('We failed to raach a server.')
             print('Reaseon:  %s' % (e.reason,))
         # str(e) replaces e.message, which no longer exists on Python 3.
         FileIO.exceptionHandler(message=str(e))
         print('%s %s' % (traceback.format_exc(), e))
         return None
Beispiel #15
0
 def __process_request_xpath__(self,
                               request,
                               timeout=100,
                               whether_decode=False,
                               encode_type='utf-8'):
     """
     Open ``request`` and return the page as an object that can be queried
     with xpath.
     :param request: the request to open.
     :param timeout: timeout passed to ``urlopen``.
     :param whether_decode: whether the content is decoded before parsing.
     :param encode_type: encoding used when ``whether_decode`` is set;
         undecodable bytes are ignored.
     :return: the parsed ``lxml.etree.HTML`` result; ``None`` on failure.
     """
     try:
         response = urllib.request.urlopen(request, timeout=timeout)
         try:
             try:
                 doc = response.read()
             finally:
                 # Release the response even when the read fails.
                 response.close()
             if whether_decode:
                 doc = doc.decode(encode_type, 'ignore')
             doc = lxml.etree.HTML(doc)
         except Exception as e:
             # str(e) is safe even when the exception carries no arguments.
             FileIO.exceptionHandler(message=str(e))
             print(traceback.format_exc(), str(e))
             doc = None
         return doc
     except URLError as e:
         # An HTTP error carries both ``code`` and ``reason``, so ``code``
         # has to be tested first; testing ``reason`` first made the HTTP
         # branch unreachable.
         if hasattr(e, 'code'):
             print('The server could not fulfill the request.')
             print('Error code: ', e.code)
             print('Reason: ', e.reason)
         elif hasattr(e, 'reason'):
             print('We failed to raach a server.')
             print('Reaseon: ', e.reason)
         # ``reason`` may be an exception object; log its text.
         FileIO.exceptionHandler(message=str(e.reason))
         print(traceback.format_exc(), e.reason)
         return None
     except socket.timeout as e:
         print('Error code: socket timeout', e)
         FileIO.exceptionHandler(message=str(e))
         print(traceback.format_exc(), str(e))
         return None
     except Exception as e:
         FileIO.exceptionHandler(message=str(e))
         print(traceback.format_exc(), str(e))
         print('Do Not know what is wrong.')
         return None
Beispiel #16
0
 def disease_count(self):
     """Count questions per month for every distinct disease and export them.

     For each entry returned by ``self.get_distinct_disease()`` the question
     collection is aggregated by ``post_month`` for that entry's
     ``disease_url``.  The entry's values followed by the monthly counts are
     appended as a row to ``./../result/disease_count.csv`` (preceded, for
     the first entry only, by a header row built from the keys), and the
     combined result is appended as JSON to
     ``./../result/disease_count.json``.

     :return: None
     """
     csv_file = './../result/disease_count.csv'
     client = MongoClient(self.host, self.port)
     try:
         question_collection = client[self.db][self.question_collection]
         disease_list = self.get_distinct_disease()
         for index, disease in enumerate(disease_list, 1):
             pipeline = [{
                 "$match": {
                     "disease_url": disease['disease_url']
                 }
             }, {
                 "$group": {
                     "_id": "$post_month",
                     "count": {
                         "$sum": 1
                     }
                 }
             }, {
                 "$sort": {
                     "_id": 1
                 }
             }]
             monthly = {}
             for count in question_collection.aggregate(pipeline):
                 monthly[count['_id']] = count['count']
             result = {'disease': disease, 'count': monthly}
             # Dict views cannot be concatenated with ``+`` on Python 3;
             # convert them to lists first.
             if index == 1:
                 # NOTE(review): the header comes from the first entry only;
                 # later rows presumably misalign when their set of months
                 # differs - confirm against the data.
                 FileIO.writeToCsvFile(
                     list_msg=list(disease.keys()) + list(monthly.keys()),
                     filename=csv_file)
             FileIO.writeToCsvFile(
                 list_msg=list(disease.values()) + list(monthly.values()),
                 filename=csv_file)
             print(index, result)
             FileIO.writeToFile(text=json.dumps(result),
                                filename='./../result/disease_count.json')
     finally:
         # Release the client even when a query or a write fails.
         client.close()
Beispiel #17
0
 def get_answer_doctor(self, url=None):
     """Scrape an answering doctor's card page and store the profile.

     The page is fetched GBK-decoded through ``self.process_url_request``,
     the profile fields are read with XPath, printed and inserted into
     ``self.answer_doctor_table``.  The statistics block (grade, best
     replies, helped users, gratitude, fans, reputation) is read only when
     the page exposes exactly five statistic spans.

     :param url: page to fetch; ``self.url`` is used when omitted.  The
         stored ``doctor_url`` is always ``self.url``.
     :return: the profile dict, or ``None`` when fetching, parsing or the
         insert raised (the failure is logged and ``self.url`` is appended
         to the error-url file).
     """
     sel = self.process_url_request(url=self.url if url is None else url,
                                    whether_decode=True,
                                    encode_type='GBK')
     mode = re.compile(r'\d+')
     doctor = {}
     doctor['doctor_url'] = self.url
     try:
         # A failed fetch leaves sel as None; the resulting AttributeError is
         # handled by the except clause below like any other parse failure.
         profile_list = sel.xpath(
             '//ul[@class="fl bdul f14"]/li/span[2]/text()')
         doctor['name'] = profile_list[0]
         doctor['title'] = profile_list[1]
         doctor['department'] = profile_list[2]

         statistics = '//ul[@class="bdxli pt10 f12 clearfix black"]'
         if len(sel.xpath(statistics + '/li/span/text()')) == 5:
             doctor['grade'] = sel.xpath(
                 statistics + '/li[1]/span/text()')[0]
             # The numeric statistics keep only their first run of digits.
             doctor['best_reply'] = mode.findall(
                 sel.xpath(statistics + '/li[2]/span/text()')[0])[0]
             doctor['help_user'] = mode.findall(
                 sel.xpath(statistics + '/li[3]/span/text()')[0])[0]
             doctor['gratitude_user'] = mode.findall(
                 sel.xpath(statistics + '/li[5]/span/text()')[0])[0]
             doctor['fan'] = mode.findall(
                 sel.xpath(statistics + '/li[6]/span/text()')[0])[0]
             # The fourth item holds icons: the icon file name gives the
             # reputation type, the number of icons the reputation value.
             reputation_content = sel.xpath(
                 statistics + '/li[4]/cite/img/@src')
             doctor['reputation_type'] = reputation_content[0].split(
                 'ysmp/')[1].replace('.gif', '')
             doctor['reputation'] = len(reputation_content)

         detail = '//div[@class="clearfix cl djzhan f12 mr10 pt10 pb10 none"]'
         paragraphs = sel.xpath(detail + '/p/text()')
         doctor['skill'] = paragraphs[0].replace('擅长疾病:', '')
         doctor['hospital'] = paragraphs[1].replace('所在医院:', '')
         doctor['introduce'] = sel.xpath(
             detail + '/div/text()')[0].replace('个人简介:', '')
         doctor['crawl_time'] = datetime.datetime.now().strftime('%Y-%m-%d')
         print('=================================================================')
         Doctor.print_doctor(doctor)
         self.mysql.insert(table=self.answer_doctor_table, record=doctor)
         return doctor
     except Exception as e:
         trace = traceback.format_exc()
         print('%s %s' % (trace, e))
         FileIO.exceptionHandler(trace, url=self.url)
         FileIO.writeToFile(self.url,
                            './../data/answer_doctor_url_error.csv')
         return None
Beispiel #18
0
 def callback(self, ch, method, properties, body):
     """Append a received message body to the question file and ack it.

     The target path is built from ``data_year`` and ``question_save_file``
     below ``/mnt/qianlong/data/xywy/``.

     :param ch: channel used to acknowledge the delivery.
     :param method: delivery metadata; supplies ``delivery_tag``.
     :param properties: message properties (unused).
     :param body: the message payload that is written out.
     """
     year_dir = '/mnt/qianlong/data/xywy/' + data_year
     destination = year_dir + "/" + question_save_file
     print(type(body), body)
     FileIO.writeToFile(filename=destination, text=body)
     ch.basic_ack(delivery_tag=method.delivery_tag)
Beispiel #19
0
class BaseSpider(object):
    """
    Base spider class: picks a random user agent and turns a url into a
    request and then into the page data the caller needs, either as an
    object that can be queried with xpath or as the page source itself.
    """

    def get_header(self):
        """
        Build the request headers.
        :return: a header dict holding a randomly chosen User-Agent.
        """
        return {'User-Agent': random.choice(user_agents)}

    def set_proxy(self):
        """
        Install a global urllib2 opener that sends http traffic through a
        randomly chosen entry of ``proxies``.
        """
        proxy_temp = {'http': random.choice(proxies)}
        proxy_handler = urllib2.ProxyHandler(proxy_temp)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener=opener)
        print('********* %s' % (proxy_temp,))

    def process_url_request(self,
                            url,
                            try_number=20,
                            timeout=100,
                            xpath_type=True,
                            whether_decode=False,
                            encode_type='utf-8',
                            use_proxy=False):
        """
        Fetch the page behind a url and return it in the requested form.
        :param url: target url.
        :param try_number: retry limit; the page is requested until a result
            is obtained or ``try_number + 1`` attempts have been made.
        :param timeout: timeout for each request.
        :param xpath_type: whether to return an object usable with xpath
            instead of the raw page content.
        :param whether_decode: whether the content is decoded before it is
            parsed (xpath mode only).
        :param encode_type: encoding used when decoding.
        :param use_proxy: whether to install a random proxy before each
            attempt, boolean.
        :return: the page data; ``None`` after all attempts failed.
        """
        doc = None
        try_index = 0
        while doc is None:
            # The proxy is applied in both modes; it used to be skipped
            # silently when xpath_type was false.
            if use_proxy:
                self.set_proxy()
            request = urllib2.Request(url=url, headers=self.get_header())
            if xpath_type:
                doc = self.__process_request_xpath__(
                    request=request,
                    timeout=timeout,
                    whether_decode=whether_decode,
                    encode_type=encode_type)
            else:
                # NOTE(review): __process_request__ is not defined in the
                # visible part of this class - confirm it exists.
                doc = self.__process_request__(request=request,
                                               timeout=timeout)
            try_index = try_index + 1
            if try_index > try_number:
                break
        return doc

    def __process_request_xpath__(self,
                                  request,
                                  timeout=100,
                                  whether_decode=False,
                                  encode_type='utf-8'):
        """
        Open ``request`` and return the page as an object that can be
        queried with xpath.
        :param request: the request to open.
        :param timeout: timeout passed to ``urlopen``.
        :param whether_decode: whether the content is decoded before parsing.
        :param encode_type: encoding used when ``whether_decode`` is set;
            undecodable bytes are ignored.
        :return: the parsed ``lxml.etree.HTML`` result; ``None`` on failure.
        """
        try:
            response = urllib2.urlopen(request, timeout=timeout)
            try:
                try:
                    doc = response.read()
                finally:
                    # Release the response even when the read fails.
                    response.close()
                if whether_decode:
                    doc = doc.decode(encode_type, 'ignore')
                doc = lxml.etree.HTML(doc)
            except Exception as e:
                # str(e) replaces e.message, which is gone on Python 3.
                FileIO.exceptionHandler(message=str(e))
                print('%s %s' % (traceback.format_exc(), e))
                doc = None
            return doc
        except URLError as e:
            # An HTTP error carries both ``code`` and ``reason``, so
            # ``code`` has to be tested first; testing ``reason`` first
            # made the HTTP branch unreachable.
            if hasattr(e, 'code'):
                print('The server could not fulfill the request.')
                print('Error code:  %s' % (e.code,))
                print('Reason:  %s' % (e.reason,))
            elif hasattr(e, 'reason'):
                print('We failed to raach a server.')
                print('Reaseon:  %s' % (e.reason,))
            FileIO.exceptionHandler(message=str(e))
            print('%s %s' % (traceback.format_exc(), e))
            return None
Beispiel #20
0
                doc = None
            return doc
        except URLError, e:
            if hasattr(e, 'reason'):
                print 'We failed to raach a server.'
                print 'Reaseon: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server could not fulfill the request.'
                print 'Error code: ', e.code
                print 'Reason: ', e.reason
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            return None
        except socket.timeout, e:
            print 'Error code: socket timeout', e
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            return None
        except Exception, e:
            FileIO.exceptionHandler(message=e.message)
            print traceback.format_exc(), e.message
            print 'Do Not know what is wrong.'
            return None

    def __process_request__(self, request, timeout=100):
        """
        处理request请求,以string的类型返回网页内容。
        :param request:request的请求
        :param timeout:超时时间
        :return:返回内容,失败返回None
        """
Beispiel #21
0
 def callback(self, ch, method, properties, body):
     """Append a received message body to the detail-question file and ack it.

     The target path is ``detail_year_dir`` followed by ``detail_question/``
     and ``question_save_file``.

     :param ch: channel used to acknowledge the delivery.
     :param method: delivery metadata; supplies ``delivery_tag``.
     :param properties: message properties (unused).
     :param body: the message payload that is written out.
     """
     question_dir = detail_year_dir + "detail_question/"
     destination = question_dir + question_save_file
     print(type(body), body)
     FileIO.writeToFile(filename=destination, text=body)
     ch.basic_ack(delivery_tag=method.delivery_tag)
                                                encode_type="GBK",
                                                use_proxy=self.use_proxy)
            try:
                first['first_name'] = selector.xpath(
                    '//p[@class="pt5 pb5 lh180 f12 blue-a"]/a[3]/text()')[0]
                first['first_url'] = 'http://club.xywy.com' + selector.xpath(
                    '//p[@class="pt5 pb5 lh180 f12 blue-a"]/a[3]/@href')[0]
            except:
                first['first_name'] = selector.xpath(
                    '//li[@class="hd_family on"]/a/text()')[0]
                first['first_url'] = selector.xpath(
                    '//li[@class="hd_family on"]/a/@href')[0]
        except:
            first['first_name'] = ''
            first['first_url'] = ''
        return first


if __name__ == '__main__':
    # Run FirstDepartment on every distinct, non-empty disease url stored in
    # the xywy.question1 collection and append each result as one JSON line
    # to the result file.
    client = MongoClient('localhost', 27017)
    try:
        db = client.xywy
        collection = db.question1
        url_list = collection.distinct("disease_url")
        for url in url_list:
            if url != u'':
                spider = FirstDepartment(url=url)
                first = spider.parse()
                print('**', first)
                FileIO.writeToFile(text=json.dumps(first),
                                   filename='./../result/first_department.json')
    finally:
        # Release the connection even when a spider run raises.
        client.close()
Beispiel #23
0
 def get_family_doctor(self, url=None):
     """Scrape a family doctor's home page and store the profile.

     The page is fetched GBK-decoded through ``self.process_url_request``.
     When it shows exactly one name heading, the profile is read from this
     page.  Otherwise ``self.url`` is switched to the matching ``doc_card``
     url and the profile is taken from ``self.get_answer_doctor``.  In both
     cases the result is printed and inserted into
     ``self.family_doctor_table``.

     :param url: page to fetch; ``self.url`` is used when omitted.  The
         stored ``doctor_url`` is always ``self.url``.
     :return: the profile dict, or ``None`` when fetching, parsing or the
         insert raised (the failure is logged and ``self.url`` is appended
         to the error-url file).
     """
     sel = self.process_url_request(url=self.url if url is None else url,
                                    whether_decode=True,
                                    encode_type='GBK')
     doctor = {}
     doctor['doctor_url'] = self.url
     try:
         # A failed fetch leaves sel as None; the resulting AttributeError is
         # handled by the except clause below like any other parse failure.
         name = sel.xpath('//h3[@class="fn clearfix cl"]/i/text()')
         if len(name) == 1:
             doctor['name'] = name[0].replace('医生个人主页', '')
             summary = sel.xpath('//div[@class=" lh200 pt10 f14"]/text()')
             doctor['title'] = summary[0]
             # The second text node reads "<hospital>-<department>".
             hospital_department = summary[1].split('-')
             doctor['hospital'] = hospital_department[0]
             doctor['department'] = hospital_department[1]
             doctor['skill'] = sel.xpath(
                 '//div[@class="clearfix"]/div[1][@class="HomeJie f14 fwei pt20"]/div/text()'
             )[0]
             introduce = sel.xpath(
                 '//div[@class="clearfix"]/div[2][@class="HomeJie f14 fwei pt20"]/div/text()'
             )
             if len(introduce) != 0:
                 doctor['introduce'] = introduce[0]
             doctor['reputation'] = sel.xpath(
                 '//div[@class="clearfix mt20"]/span/text()')[0]
             help_content = sel.xpath(
                 '//div[@class="f14 fwei HomeHelp tc lh200 clearfix pt10"]/span/text()'
             )
             doctor['help_user'] = help_content[0]
             # The second span is stored only when exactly two are present.
             if len(help_content) == 2:
                 doctor['sign'] = help_content[1]
             doctor['crawl_time'] = datetime.datetime.now().strftime(
                 '%Y-%m-%d')
         else:
             url = 'http://club.xywy.com/doc_card/' + self.url.split(
                 '/')[-1]
             self.url = url
             # NOTE(review): get_answer_doctor returns None on failure and
             # that None is passed on to print_doctor/insert below - confirm
             # this is intended.
             doctor = self.get_answer_doctor(url=url)
         Doctor.print_doctor(doctor)
         self.mysql.insert(table=self.family_doctor_table, record=doctor)
         return doctor
     except Exception as e:
         trace = traceback.format_exc()
         print('%s %s' % (trace, e))
         FileIO.exceptionHandler(trace, url=self.url)
         FileIO.writeToFile(self.url,
                            './../data/answer_doctor_url_error.csv')
         return None