def craw_info(self):
    '''
    Crawl officer data for each queued candidate link.
    :return:
    '''
    global temp, counter
    while True:
        if len(self.may_officer_info) == 0:
            break
        officer_data = self.may_officer_info.pop(0)
        raw_data = get_html_by_url(officer_data[2])
        if raw_data is None:
            continue
        soup = BeautifulSoup(raw_data, 'html.parser')
        is_officer, filter_info = self.officer_filter(soup, officer_data)
        if not is_officer:
            print('Not Officer: ', filter_info, officer_data)
            # self.other_csv_writer.writerow(officer_data)
            continue
        self.add_officer_info(soup, officer_data)
    # This worker is done: decrement the shared counter and, if it was the last
    # worker, close the output files.
    threadLock_init.acquire()
    counter -= 1
    if counter == 0:
        self.out.close()
        self.out_other()
    threadLock_init.release()
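# Usage sketch (not part of the original source): how the crawl workers above could
# be started. The spider object, the thread count, and the handling of the shared
# `counter` global are assumptions; the actual driver in this repo may differ.
import threading

def run_craw_workers(spider, num_threads=4):
    global counter
    counter = num_threads  # craw_info() decrements this when a worker finishes
    threads = [threading.Thread(target=spider.craw_info) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()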
def fetch_person_index(key, part_url, index_list):
    if part_url is None:
        return
    pro_url = 'http://ldzl.people.com.cn/dfzlk/front/' + part_url
    try:
        raw_data = get_html_by_url(pro_url)
    except:
        print('province url error: ', pro_url)
        return
    soup = BeautifulSoup(raw_data, 'html.parser')
    div_tag = soup.find('div', class_='fr p2j_reports_right title_2j sjzlk')
    city_h2_tags = div_tag.find_all('h2')
    city_div_tags = div_tag.find_all('div', class_='zlk_list')
    name_set = set()
    for i in range(len(city_h2_tags)):
        city_name = re.sub(r'\n', '', city_h2_tags[i].get_text())
        li_tags = city_div_tags[i].find_all('li')
        if len(li_tags) == 0:
            continue
        for li_tag in li_tags:
            district = del_content_blank(li_tag.find('span').get_text())
            person_name = li_tag.find('em').get_text()
            if person_name not in name_set:
                name_set.add(person_name)
                person_index = (person_name, key, city_name, district)
                print(person_index)
                index_list.append(person_index)
    return
def get_lemmid_and_pic_url(baike_url):
    try:
        raw_data = get_html_by_url(baike_url)
    except:
        print('baike url error: ', baike_url)
        return None, ''
    soup = BeautifulSoup(raw_data, 'html.parser')
    lemmid = soup.find(
        'div', class_='lemmaWgt-promotion-rightPreciseAd').get('data-lemmaid')
    pic_div = soup.find('div', class_='summary-pic')
    pic_url = ''
    if pic_div is not None:
        pic_url = pic_div.find('img').get('src')
    return lemmid, pic_url
def add_may_officer_info(self, lemmaId):
    '''
    Add related-person info for an officer.
    :param lemmaId: id of the officer whose related persons should be added
    :return: list of related-person links
    '''
    global total_count, success_count
    total_count += 1
    json_url = 'https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=' + lemmaId
    raw_data = get_html_by_url(json_url)
    if raw_data is None:
        print('fetched related links: ' + str(success_count) + '/' + str(total_count))
        print('may officer info is None')
        return []
    json_data = json.loads(str(raw_data, encoding='utf-8'))
    if not isinstance(json_data, list):
        print('fetched related links: ' + str(success_count) + '/' + str(total_count))
        print('json is not a list')
        return []
    relative_links = []
    for item1 in json_data:
        # Keep '人物' (person) and '学者' (scholar) in Chinese: they are matched
        # against the Chinese tipTitle values returned by the API.
        if item1['tipTitle'].find('人物') < 0 and item1['tipTitle'].find('学者') < 0:
            continue
        data = item1['data']
        for item in data:
            name = item['title']
            url = item['url']
            pic = item['pic']
            lemmaid = item['lemmaId']
            relative_links.append(url)
            threadLock_id_set_and_may_officer.acquire()
            if int(lemmaid) not in self.may_id_set:
                self.may_id_set.add(int(lemmaid))
                off_info = [lemmaid, name, url, pic]
                self.may_officer_info.append(off_info)
            threadLock_id_set_and_may_officer.release()
    print('add may officer info Success!')
    success_count += 1
    print('fetched related links: ' + str(success_count) + '/' + str(total_count))
    return relative_links
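# Illustrative only: the shape of one zhixinmap item, inferred from the fields that
# add_may_officer_info() reads. The real API response may carry additional fields,
# and the placeholder values below are not real data.
EXAMPLE_ZHIXINMAP_ITEM = {
    'tipTitle': '相关人物',  # only titles containing 人物 or 学者 are kept
    'data': [
        {'title': '<name>', 'url': '<baike url>', 'pic': '<image url>', 'lemmaId': '<id>'},
    ],
}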
def get_index_list():
    url = 'http://ldzl.people.com.cn/dfzlk/front/xian35.htm'
    raw_data = get_html_by_url(url)
    soup = BeautifulSoup(raw_data, 'html.parser')
    li_tags = soup.find('div', class_='fl p2j_reports_left').find_all('li')
    province_dict = {}
    for li_tag in li_tags:
        a_tag = li_tag.find('a')
        href = a_tag.get('href')
        if href == '#':
            continue
        province_dict[a_tag.get_text()] = href
    index_list = []
    for key in province_dict.keys():
        fetch_person_index(key, province_dict[key], index_list)
    return index_list
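# Minimal usage sketch (not in the original module): persisting the
# (name, province, city, district) tuples returned by get_index_list() to a CSV
# file. The output path is an assumption.
import csv

def save_index_list(path='../data/person_index.csv'):
    index_list = get_index_list()
    with open(path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out, dialect='excel')
        writer.writerow(['name', 'province', 'city', 'district'])
        writer.writerows(index_list)
    return len(index_list)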
def get_person_baike_url(row):
    name, province, city, district = row
    url_list = list()
    name_url = "https://baike.baidu.com/search/word?word={0}".format(
        urllib.request.quote(name))
    html = get_html_by_url(name_url)
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select(
        'body > div.body-wrapper > div.before-content > div > ul > li')
    for item in items:
        text = item.getText()[1:]
        if province in text or city in text or district in text:
            try:
                new_name_url = "https://baike.baidu.com" + item.a['href']
                url_list.append(new_name_url)
            except (AttributeError, TypeError):
                url_list.append(name_url)
    return url_list
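# Usage sketch (not part of the original source): resolving one
# (name, province, city, district) row to candidate Baike pages and then to
# (lemmid, pic_url) pairs via the two helpers above. The example row is hypothetical.
def resolve_row(row):
    results = []
    for baike_url in get_person_baike_url(row):
        lemmid, pic_url = get_lemmid_and_pic_url(baike_url)
        if lemmid is not None:
            results.append((lemmid, baike_url, pic_url))
    return results

# e.g. resolve_row(('张三', '某省', '某市', '某县'))  # placeholder values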
def update_head_image():
    """
    Update the head_image column of crawler.officer_message: for every row whose
    head_image is still empty, download the picture from the stored head_image_url
    and write the binary data back into the head_image field.
    :return: nothing
    """
    Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6')
    select_sql = "SELECT id_index, head_image_url FROM crawler.officer_message WHERE officer_message.head_image ='';"
    update_sql = "UPDATE crawler.officer_message SET head_image = {0} WHERE id_index = '{1}';"
    cur = Connection.cursor()
    cur.execute(select_sql)
    for line in cur.fetchall():
        id_index, head_image_url = line
        head_image = get_html_by_url(head_image_url)
        finish_update_sql = update_sql.format(psycopg2.Binary(head_image), id_index)
        print(finish_update_sql)
        cur.execute(finish_update_sql)
        Connection.commit()
        print(id_index, 'head image stored')
    cur.close()
    Connection.close()
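# Alternative sketch (not the original code): the same update done with a psycopg2
# parameterized query, which lets the driver escape the binary payload instead of
# formatting SQL strings by hand. Table and column names follow update_head_image().
import psycopg2

def update_one_head_image(connection, id_index, head_image_bytes):
    with connection.cursor() as cur:
        cur.execute(
            "UPDATE crawler.officer_message SET head_image = %s WHERE id_index = %s;",
            (psycopg2.Binary(head_image_bytes), id_index))
    connection.commit()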
if __name__ == '__main__':
    f_in = open('../data/官员信息.csv', newline='', encoding='utf-8')
    csv_reader = csv.reader(f_in)
    f_out = open('../data/官员信息new.csv', 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(f_out, dialect='excel')
    i = 0
    for row in csv_reader:
        sleep(1)
        i += 1
        print(i)
        lemmaId = row[0]
        json_url = 'https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=' + lemmaId
        raw_data = get_html_by_url(json_url)
        if raw_data is None:
            print('may officer info is None')
            csv_writer.writerow(row)
            continue
        json_data = json.loads(str(raw_data, encoding='utf-8'))
        if not isinstance(json_data, list):
            print('json is not a list: ', json_data)
            csv_writer.writerow(row)
            continue
        print('find relative links Success!')
        off_infos = []
        for item1 in json_data: