class GetAnswerDoctor(object):
    """Crawl doctor pages whose URLs are stored in a database table.

    Typical use is ``get_url()`` followed by ``get_doctor_info()``: the
    first call opens the database connection and loads the URL rows that
    the second call walks over.
    """

    def __init__(self, url_table, answer_doctor_table, family_doctor_table,
                 zhuanjia_doctor_table):
        """Store the table names; no database access happens here.

        :param url_table: table the doctor URLs are read from.
        :param answer_doctor_table: table name handed to each ``Doctor``.
        :param family_doctor_table: table name handed to each ``Doctor``.
        :param zhuanjia_doctor_table: table name handed to each ``Doctor``.
        """
        self.url_table = url_table
        self.answer_doctor_table = answer_doctor_table
        self.family_doctor_table = family_doctor_table
        self.zhuanjia_doctor_table = zhuanjia_doctor_table
        # Filled by get_url(); empty until then.
        self.doctor_url = []

    def get_url(self):
        """Open a database connection and load every row of ``url_table``.

        The connection is kept on ``self.mysql`` because
        ``get_doctor_info`` passes it on to each ``Doctor``.

        :return: the rows returned by ``select`` (also kept on
            ``self.doctor_url``).
        """
        self.mysql = MySQLDatabaseClass()
        rows = self.mysql.select(table=self.url_table)
        self.doctor_url = rows
        return rows

    def get_doctor_info(self):
        """Build a ``Doctor`` for every loaded URL and crawl family doctors.

        Only URLs containing ``'family'`` trigger a crawl. URLs containing
        ``'doc_card'`` or ``'z.xywy.com/doc'`` were meant to be handled by
        ``Doctor.get_answer_doctor`` / ``Doctor.get_zhuanjia_doctor``, but
        those calls are disabled, so such URLs only get a ``Doctor``
        instance constructed for them.
        """
        for row in self.doctor_url:
            page_url = row['doctor_url']
            # A Doctor is constructed for every URL, matched or not.
            spider = Doctor(
                url=page_url,
                answer_doctor_table=self.answer_doctor_table,
                family_doctor_table=self.family_doctor_table,
                zhuanjia_doctor_table=self.zhuanjia_doctor_table,
                mysql=self.mysql)
            if 'family' in page_url:
                spider.get_family_doctor()
class IntoMySQL(object):
    """Read crawled book records (one JSON object per line) for MySQL import.

    NOTE: the actual insert of book rows is currently disabled in
    ``write_mysql``; see the comment at the end of that method.
    """

    def __init__(self, book_table, used_book_table):
        """Open the database connection and reset the counters.

        :param book_table: table intended for book records.
        :param used_book_table: table ``_write_used_book`` inserts into.
        """
        self.mysql = MySQLDatabaseClass()
        self.book_table = book_table
        self.used_book_table = used_book_table
        self.total_number = 0  # lines read by write_mysql
        self.drop_number = 0   # only touched by the disabled insert path

    def write_mysql(self, data_file_name):
        """Scan a JSON-lines file and report books cheaper new than used.

        Every line is parsed, ``total_number`` is incremented, and the
        record is printed when its ``amazon_price`` is below its
        ``lowest_used_price``. Errors in that comparison (for example a
        missing price key) are printed and the scan continues.

        :param data_file_name: path of the file to read; each line must be
            a JSON object with ``used_book_list`` and ``used_book_page``.
        :raises KeyError: if a record lacks ``used_book_list`` or
            ``used_book_page``.
        """
        # The context manager closes the file even when a line fails to
        # parse; previously the handle was never closed.
        with open(data_file_name, "r") as data_file:
            for line in data_file:
                self.total_number += 1
                book = json.loads(line)
                # pop() detaches the list from the record, so no copy is
                # needed to keep it after removal.
                used_book_list = book.pop("used_book_list")
                book.pop("used_book_page")
                try:
                    book["number_of_used_crawled"] = len(used_book_list)
                except TypeError:
                    # e.g. the crawler stored null instead of a list
                    book["number_of_used_crawled"] = 0
                try:
                    if book["amazon_price"] < book["lowest_used_price"]:
                        print("#############")
                        # Re-parse the raw line so the untouched record
                        # (including the popped keys) is what gets shown.
                        print(json.dumps(json.loads(line), indent=2))
                except Exception as e:
                    print(traceback.format_exc(), e.args[0])
                # Disabled insert path, kept for reference:
                # if book["ISBN_13"] is not None and book["ISBN_10"] is not None:
                #     self.mysql.insert(table=self.book_table, record=book)
                #     print "writing used books..."
                #     if used_book_list is not None and len(used_book_list) > 0:
                #         self._write_used_book(used_book_list=used_book_list,
                #                               isbn_13=book["ISBN_13"],
                #                               isbn_10=book["ISBN_10"])
                # else:
                #     self.drop_number += 1

    def _write_used_book(self, used_book_list, isbn_13, isbn_10):
        """Tag each used-book offer with its ISBNs and insert it.

        Each record is mutated in place: it gains ``ISBN_13``, ``ISBN_10``
        and ``ISBN_index`` (``"<isbn_10>-<isbn_13>-<position>"``).

        :param used_book_list: offer dicts for one book.
        :param isbn_13: the book's ISBN-13 string.
        :param isbn_10: the book's ISBN-10 string.
        """
        for index, used_book in enumerate(used_book_list):
            used_book["ISBN_index"] = isbn_10 + '-' + isbn_13 + '-' + str(index)
            used_book["ISBN_13"] = isbn_13
            used_book["ISBN_10"] = isbn_10
            self.mysql.insert(table=self.used_book_table, record=used_book)
def update_post(self):
    """Re-fetch the details of every stored post and write them back.

    Every row of the ``post`` table is passed through
    ``self.get_post_detail`` and the result is saved with ``post_url``
    as the update key.
    """
    mysql = MySQLDatabaseClass()
    try:
        for stored_post in mysql.select(table='post'):
            post = self.get_post_detail(stored_post)
            mysql.update(table='post',
                         record=post,
                         primary_key={'post_url': post['post_url']})
    finally:
        # Release the connection even if a fetch or update fails midway.
        mysql.close()
def get_post(self):
    """Collect the posts behind every target URL and store them.

    For each URL in ``self.target_url_list`` the posts returned by
    ``self.get_post_url`` are echoed to stdout and inserted into the
    ``post`` table.
    """
    mysql = MySQLDatabaseClass()
    try:
        for url in self.target_url_list:
            for post in self.get_post_url(url=url):
                # Single-argument form prints identically whether print
                # is a statement or a function.
                print(post)
                mysql.insert(table='post', record=post)
    finally:
        # Release the connection even if crawling or inserting fails.
        mysql.close()
def into_database(file_name):
    """Insert every JSON line of a file into the ``question`` table.

    Each record is tagged with the file's base name under ``file_name``
    before it is inserted. The database connection is replaced with a
    fresh one after every 100001 inserts.

    :param file_name: path of a file with one JSON object per line; the
        part after the last ``/`` is stored with each record.
    """
    with open(file_name, 'r') as data_file:
        base_name = file_name.split('/')[-1]
        mysql = MySQLDatabaseClass()
        inserted_since_reconnect = 0
        try:
            for number, line in enumerate(data_file):
                record = json.loads(line)
                record['file_name'] = base_name
                print(base_name, number, record)
                mysql.insert(table='question', record=record)
                inserted_since_reconnect += 1
                if inserted_since_reconnect > 100000:
                    mysql.close()
                    # Drop the reference first so a failed reconnect does
                    # not make the cleanup below close it a second time.
                    mysql = None
                    mysql = MySQLDatabaseClass()
                    inserted_since_reconnect = 0
        finally:
            # The last connection used to be left open; close it, also
            # when a line fails to parse or an insert raises.
            if mysql is not None:
                mysql.close()
def update_doctor_info(): """ 跟新这次抓取的医生信息 :return: """ mysql = MySQLDatabaseClass() doctor_dict_list = mysql.select( table=table_info['doctor_communication'], record={'crawl_number': crawl_number}) for doctor_dict in doctor_dict_list: url = doctor_dict['doctor_url'] doctor = DoctorSpider(url=url, crawl_number=crawl_number) doctor_info = doctor.get_number() if doctor_info != None: mysql.update(table=table_info['doctor_communication'], record=doctor_info, primary_key={ 'doctor_url': url, 'crawl_number': doctor_dict['crawl_number'] }) print '医生信息已入库', url else: pass mysql.close()
def __init__(self, message, doctor_url_split, crawl_number):
    """Decode one crawler message and open a database connection.

    :param message: mapping with a ``type`` entry and a ``content`` entry
        holding a JSON string.
    :param doctor_url_split: stored as ``self.doctor_url_split``.
    :param crawl_number: stored as ``self.crawl_number``.
    """
    self.crawl_number = crawl_number
    self.doctor_url_split = doctor_url_split
    self.type = message['type']
    raw_content = message['content']
    self.post_comment = json.loads(raw_content)
    # Connect last, once the message has been decoded successfully.
    self.mysql = MySQLDatabaseClass()
class InsertDatabase(object): def __init__(self, message, doctor_url_split, crawl_number): self.type = message['type'] self.post_comment = json.loads(message['content']) self.mysql = MySQLDatabaseClass() self.doctor_url_split = doctor_url_split self.crawl_number = crawl_number def process(self): post = self.post_comment['post'] comment_list = self.post_comment['comment_list'] self.mysql.insert(table=table_info[self.type]['post'], record=post) print '***帖子信息已入库' index = 1 for comment in comment_list: self.mysql.insert(table=table_info[self.type]['comment_first'], record=comment['comment_first']) print '***第', str(index), '个一级评论已入库' index = index + 1 for comment_second in comment['comment_second_list']: parent_comment_list = self.mysql.select( table=table_info[self.type]['comment_first'], record=comment['comment_first']) if len(parent_comment_list) == 0: print 'Error, not find parent comment for:', comment_second pass else: parent_comment = parent_comment_list[0] comment_second[ 'parent_comment_doctor_url'] = parent_comment[ 'doctor_url'] comment_second[ 'parent_comment_comment_time'] = parent_comment[ 'comment_time'] comment_second[ 'parent_comment_comment_content'] = parent_comment[ 'comment_content'] self.mysql.insert( table=table_info[self.type]['comment_second'], record=comment_second) print '***二级评论已入库' pass self.insert_doctor_url(doctor_url_list=post['post_like_doctor_url'], split=True) self.insert_doctor_url(doctor_url_list=post['post_comment_doctor_url'], split=True) self.insert_doctor_url(doctor_url_list=[post['post_doctor_url']]) def insert_doctor_url(self, doctor_url_list, split=False): if split == True: doctor_url_list = doctor_url_list.split(self.doctor_url_split) else: pass for url in doctor_url_list: if 'club.xywy.com/doc_card' in url: doctor_info = {} doctor_info['doctor_url'] = url doctor_info['crawl_number'] = self.crawl_number print doctor_info self.mysql.insert(table=table_info['doctor_communication'], record=doctor_info) 
doctor_info['crawl_time'] = datetime.datetime.now().strftime( '%Y-%m-%d') self.mysql.insert(table=table_info['doctor_url'], record=doctor_info) @staticmethod def update_doctor_info(): """ 跟新这次抓取的医生信息 :return: """ mysql = MySQLDatabaseClass() doctor_dict_list = mysql.select( table=table_info['doctor_communication'], record={'crawl_number': crawl_number}) for doctor_dict in doctor_dict_list: url = doctor_dict['doctor_url'] doctor = DoctorSpider(url=url, crawl_number=crawl_number) doctor_info = doctor.get_number() if doctor_info != None: mysql.update(table=table_info['doctor_communication'], record=doctor_info, primary_key={ 'doctor_url': url, 'crawl_number': doctor_dict['crawl_number'] }) print '医生信息已入库', url else: pass mysql.close() def close(self): self.mysql.close()
def __init__(self, book_table, used_book_table):
    """Open a database connection and remember the target tables.

    :param book_table: stored as ``self.book_table``.
    :param used_book_table: stored as ``self.used_book_table``.
    """
    self.mysql = MySQLDatabaseClass()
    self.book_table, self.used_book_table = book_table, used_book_table
    # Both counters start at zero.
    self.total_number = self.drop_number = 0
def get_url(self):
    """Open a database connection and load every row of ``url_table``.

    The connection is kept on ``self.mysql`` and the rows on
    ``self.doctor_url``.

    :return: the rows returned by ``select``.
    """
    self.mysql = MySQLDatabaseClass()
    rows = self.mysql.select(table=self.url_table)
    self.doctor_url = rows
    return rows