def main():
    """Crawl users from the saved URL id, dump their posts and queue followees.

    Starting at the id returned by ``get_current_url_id()``, each iteration:

    1. looks up the ``(url, name)`` record for the current id in ``UrlRecord``;
    2. writes every entry of every blog page of that user to a file whose
       name comes from ``build_file_name`` (three tab-terminated fields per
       line);
    3. walks every page of the user's "care" (followed people) list and adds
       each entry to ``UrlRecord``;
    4. advances the id and persists it with ``set_current_url_id``.

    The loop ends when the id reaches 1000, when no record exists for the
    current id, or when the followed-people page URL is the empty string.
    """
    cookie = get_cookie()
    url_record = UrlRecord()
    current_url_id = get_current_url_id()

    while current_url_id < 1000:
        person_url = url_record.get(current_url_id)
        if person_url is None:
            break
        url = person_url[0]
        name = person_url[1]
        file_name = build_file_name(current_url_id, url, name)

        people = PeoplePage(url, cookie)

        # Write the user's weibo posts to the file.  The ``with`` block
        # guarantees the file is closed even if a page fetch or write raises.
        with open(file_name, 'w') as fp:
            total_num = people.get_blog_page_total_num()
            for page_no in range(1, total_num + 1):
                for blog in people.get_blog_list(page_no):
                    # Each of the first three fields is followed by a tab,
                    # then the line is terminated.
                    for field in (blog[0], blog[1], blog[2]):
                        fp.write(field)
                        fp.write('\t')
                    fp.write('\n')
                print("%s Complete %d" % (name, page_no * 100 / total_num))

        # Record the users that the current user follows.
        care_page_url = people.get_care_people_page_url()
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(care_page_url)
        if care_page_url == '':
            # NOTE(review): this stops the whole crawl without advancing the
            # saved id, so a rerun would resume at the same user - confirm
            # this is intended.
            break

        care_people_page = CarePeoplePage(care_page_url, cookie)
        total_num = care_people_page.get_page_total_num()
        for page_no in range(1, total_num + 1):
            # A distinct loop name avoids shadowing the PeoplePage object.
            for followed in care_people_page.get_people_list(page_no):
                # NOTE(review): assumes the entries are byte strings;
                # .decode() does not exist on Python 3 ``str`` - verify.
                url_record.add(followed[1].decode('utf-8'),
                               followed[0].decode('utf-8'))

        # Persist progress after each fully processed user.
        current_url_id += 1
        set_current_url_id(current_url_id)
def main():
    """Crawl users from the saved URL id, logging names by gender.

    Starting at the id returned by ``get_current_url_id()``, each iteration:

    1. looks up the record for the current id in ``UrlRecord`` and stops if
       there is none;
    2. loads the user's page and stops if its state contains
       ``'HTTP Error 403: Forbidden'``;
    3. reads the user's info page and, when it has both ``'name'`` and
       ``'gender'``, appends the name to ``./male.txt`` or ``./female.txt``;
    4. adds every entry on page 1 of the user's "care" (followed people)
       list to ``UrlRecord``;
    5. advances the id and persists it with ``set_current_url_id``.

    A user whose info URL or care-page URL is ``None`` is skipped, but the id
    is still advanced and persisted.  ``time.sleep`` is called after each
    page object is created (and once more after the followed-people list is
    fetched).
    """
    cookie_index = 1  # never changed inside this function
    url_record = UrlRecord()
    current_url_id = get_current_url_id()
    sleep_time = 4

    # The ``with`` block closes both output files on every exit path,
    # including exceptions raised while crawling.
    with open('./male.txt', 'a', encoding='utf-8') as fp_male, \
            open('./female.txt', 'a', encoding='utf-8') as fp_female:
        while True:
            print('current cookie index is', cookie_index)
            cookie = get_cookie(cookie_index)
            print('current url id is', current_url_id)

            people_url = url_record.get(current_url_id)
            if people_url is None:
                print("people url is None")
                break

            url = people_url[0]
            people = PeoplePage(url, cookie)
            time.sleep(sleep_time)

            # Stop entirely on a 403; the id is not advanced in this case.
            state = people.get_state()
            if state and 'HTTP Error 403: Forbidden' in state:
                break

            info_url = people.get_people_info_page_url()
            if info_url is None:
                print('info url is None')
                current_url_id += 1
                set_current_url_id(current_url_id)
                continue
            print(info_url)

            info_page = PeopleInfoPage(info_url, cookie)
            time.sleep(sleep_time)
            info = info_page.get_people_info()

            if 'name' in info and 'gender' in info:
                name = info['name']
                gender = info['gender']
                if gender == u'男':
                    target = fp_male
                elif gender == u'女':
                    target = fp_female
                else:
                    target = None  # any other gender value is not recorded
                if target is not None:
                    target.write(name)
                    target.write('\n')
                    # Flush per name so progress survives an abrupt stop.
                    target.flush()

            # Record the users that the current user follows.
            care_page_url = people.get_care_people_page_url()
            if care_page_url is None:
                print('care page url is None')
                current_url_id += 1
                set_current_url_id(current_url_id)
                continue

            care_people_page = CarePeoplePage(care_page_url, cookie)
            time.sleep(sleep_time)
            people_list = care_people_page.get_people_list(1)
            time.sleep(sleep_time)
            # A distinct loop name avoids shadowing the PeoplePage object.
            for followed in people_list:
                url_record.add(followed[1], followed[0])

            current_url_id += 1
            set_current_url_id(current_url_id)