import json
import random
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# get_ua(), resolve_rank_info(), resolve_rank_detail_info(),
# resolve_position_info() and get_time_str() are helpers defined elsewhere in this repo.


def try_craw_info(fund_code, try_cnt):
    """
    Retry crawling one fund's detail and position data.
    @param fund_code: fund code string (the first character is a prefix and is stripped)
    @param try_cnt: current attempt number; gives up after 5 attempts
    @return: (rank_detail_info, fund_positions_data), or (None, None) on failure
    """
    if try_cnt > 5:
        return None, None
    try:
        # Crawl the page that holds the fund's detail data
        position_title_url = "http://fundf10.eastmoney.com/ccmx_" + str(fund_code[1:]) + ".html"
        print('Attempt {0}: crawling detail data for fund {1}...'.format(try_cnt, fund_code[1:]))
        response_title = requests.get(url=position_title_url,
                                      headers={'User-Agent': get_ua()},
                                      timeout=10)
        # Crawl the endpoint that holds the fund's position (holdings) data
        position_data_url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code=" + \
            str(fund_code[1:]) + "&topline=10&year=&month=&rt=" + str(random.uniform(0, 1))
        print('Attempt {0}: crawling position data for fund {1}...'.format(try_cnt, fund_code[1:]))
        response_data = requests.get(url=position_data_url,
                                     headers={'User-Agent': get_ua()},
                                     timeout=10)
        # Parse the detail data and the position data
        rank_detail_info = resolve_rank_detail_info(fund_code[1:], response_title)
        fund_positions_data = resolve_position_info(fund_code[1:], response_data.text)
        time.sleep(random.randint(2, 4))
    except Exception:
        # Back off longer on every retry, then recurse with an incremented counter
        time.sleep(random.randint(2 * try_cnt, 4 * try_cnt))
        print("Failed to crawl data for fund {0}, retrying!".format(str(fund_code[1:])))
        rank_detail_info, fund_positions_data = try_craw_info(fund_code, try_cnt + 1)
    return rank_detail_info, fund_positions_data
def get_rank_data(url, page_index, max_page, fund_type):
    """
    Crawl every page of the fund rank list starting from page_index.
    :return: DataFrame with one row per fund
    """
    try_cnt = 0
    rank_data = []
    # Keep going while the page index is below the page count
    # and fewer than 3 responses have come back too short
    while page_index < max_page and try_cnt < 3:
        # Build the query for this page (100 records per page)
        new_url = url + '?ft=' + fund_type + '&sc=1n&st=desc&pi=' + str(page_index) + '&pn=100&fl=0&isab=1'
        print('Crawling page {0}: {1}'.format(page_index, new_url))
        # Fetch the current page
        response = requests.get(url=new_url, headers={'User-Agent': get_ua()}, timeout=10)
        if len(response.text) > 100:
            # Pull the bracketed array out of the JSONP-style response
            res_data = re.findall(r"\[\S+\]", response.text)[0]
            # Parse this page's records and accumulate them
            rank_pages_data = resolve_rank_info(res_data)
            rank_data.extend(rank_pages_data)
        else:
            # A too-short body counts as one failure; the page is skipped
            try_cnt += 1
        page_index += 1
        # Random 3-5 second sleep between pages
        time.sleep(random.randint(3, 5))
    df_rank_data = pd.DataFrame(rank_data)
    return df_rank_data
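# A minimal usage sketch for get_rank_data(), not part of the original script.
# Both the base URL and the fund_type value are assumptions: eastmoney's rank
# list is commonly served from rankhandler.aspx, and 'gp' is commonly used for
# equity funds. Adjust both to whatever the rest of this repo actually uses.
def demo_rank_data():
    base_url = 'http://fund.eastmoney.com/data/rankhandler.aspx'  # assumed endpoint
    df_rank = get_rank_data(base_url, page_index=0, max_page=3, fund_type='gp')  # assumed type code
    print(df_rank.head())
    return df_rank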
def get_metro_info(id, cityname, name):
    """ Fetch the metro line information for one city. """
    url = "http://map.amap.com/service/subway?_1618387860087&srhdata=" + id + '_drw_' + cityname + '.json'
    res = requests.get(url, headers={'User-Agent': get_ua()})
    data = json.loads(res.text)
    line_frames = []
    if data['l']:
        # Iterate over every metro line in this city
        for data_line in data['l']:
            df_per_zd = pd.DataFrame(data_line['st'])
            df_per_zd = df_per_zd[['n', 'sl', 'poiid', 'sp']]
            # 'sl' holds "lng,lat"; split it into separate coordinate columns
            df_per_zd['gd经度'] = df_per_zd['sl'].apply(lambda x: x.split(',')[0])
            df_per_zd['gd纬度'] = df_per_zd['sl'].apply(lambda x: x.split(',')[1])
            df_per_zd.drop('sl', axis=1, inplace=True)
            df_per_zd['路线名称'] = data_line['ln']
            df_per_zd['城市名称'] = name
            df_per_zd.rename(columns={
                'n': '站点名称',
                'sp': '拼音名称',
                'poiid': 'POI编号'
            }, inplace=True)
            line_frames.append(df_per_zd)
    # DataFrame.append was removed in pandas 2.x; concatenate once instead
    df_data_city = pd.concat(line_frames, ignore_index=True) if line_frames else pd.DataFrame()
    return df_data_city
def get_city_list():
    """
    Fetch every city that has a metro system.
    @return: DataFrame with pinyin name, id and Chinese name per city
    """
    url = 'http://map.amap.com/subway/index.html'
    res = requests.get(url, headers={'User-Agent': get_ua()})
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    city_list = []
    # The visible city list and the collapsed "more cities" list share the same markup
    for div_class in ('city-list fl', 'more-city-list'):
        for soup_a in soup.find('div', class_=div_class).find_all('a'):
            city_list.append({
                'name_py': soup_a['cityname'],
                'id': soup_a['id'],
                'name_ch': soup_a.get_text()
            })
    df_name = pd.DataFrame(city_list)
    return df_name
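# A minimal usage sketch (not part of the original script) chaining
# get_city_list() with get_metro_info(). The column names match the DataFrame
# built in get_city_list(); the output filename is an assumption.
def demo_metro():
    df_city = get_city_list()
    frames = []
    for _, row in df_city.iterrows():
        frames.append(get_metro_info(row['id'], row['name_py'], row['name_ch']))
        time.sleep(random.randint(1, 3))  # small pause to stay polite to the server
    df_all = pd.concat(frames, ignore_index=True)
    df_all.to_csv('metro_stations.csv', index=False, encoding='utf-8-sig')  # assumed filename
    return df_all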
def get_question_base_info(url):
    """
    Fetch the question page and extract its basic description.
    @param url: question page URL
    @return: (title, question, follower, watched, answer_count, tag_list)
    """
    response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
    # Parse the page
    soup = BeautifulSoup(response.text, 'lxml')
    # Question title
    title = soup.find("h1", {"class": "QuestionHeader-title"}).text
    # Question body (may be absent when the question has no description)
    question = ''
    try:
        question = soup.find("div", {
            "class": "QuestionRichText--collapsed"
        }).text.replace('\u200b', '')
    except Exception as e:
        print(e)
    # Follower count
    follower = int(
        soup.find_all("strong", {"class": "NumberBoard-itemValue"
                                 })[0].text.strip().replace(",", ""))
    # View count
    watched = int(
        soup.find_all("strong", {"class": "NumberBoard-itemValue"
                                 })[1].text.strip().replace(",", ""))
    # Answer count: extract the number from a string like "xxx 个回答"
    answer_str = soup.find_all(
        "h4", {"class": "List-headerText"})[0].span.text.strip()
    answer_count = int(re.findall(r'\d+', answer_str.replace(',', ''))[0])
    # Question topic tags
    tag_list = []
    tags = soup.find_all("div", {"class": "QuestionTopic"})
    for tag in tags:
        tag_list.append(tag.text)
    return title, question, follower, watched, answer_count, tag_list
def get_position_data(data, rank):
    """
    Take the Top-N funds by 1-year return and crawl each fund's detail and position data.
    @param data: rank DataFrame produced by get_rank_data()
    @param rank: how many top funds to keep
    @return: (df_rank_detail_data, df_position_data)
    """
    # Keep only rows with a valid 1-year return, then take the Top-N.
    # Note: .loc[0:rank-1] on a sorted frame selects by label, not position,
    # so slice positionally with head() and reset the index for the progress counter.
    data = data.replace('', np.nan)
    data_notna = data.dropna(subset=['近1年']).copy()
    data_notna['近1年'] = data_notna['近1年'].astype(float)
    data_sort = data_notna.sort_values(by='近1年', ascending=False)
    data_rank = data_sort.head(rank).reset_index(drop=True)
    # Crawl each fund's data
    rank_detail_data = []
    position_data = []
    error_funds_list = []
    for row_index, data_row in data_rank.iterrows():
        fund_code = str(data_row['基金代码'])
        try:
            # Crawl the page that holds the fund's detail data
            position_title_url = "http://fundf10.eastmoney.com/ccmx_" + str(fund_code[1:]) + ".html"
            print('Crawling detail data for fund {2} ({0}/{1})...'.format(row_index + 1, len(data_rank), fund_code[1:]))
            response_title = requests.get(url=position_title_url, headers={'User-Agent': get_ua()}, timeout=10)
            # Parse the fund's detail data
            rank_detail_info = resolve_rank_detail_info(fund_code[1:], response_title)
            # Crawl the endpoint that holds the fund's position data
            position_data_url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code=" + \
                str(fund_code[1:]) + "&topline=10&year=&month=&rt=" + str(random.uniform(0, 1))
            print('Crawling position data for fund {2} ({0}/{1})...'.format(row_index + 1, len(data_rank), fund_code[1:]))
            # Parse the fund's position data
            response_data = requests.get(url=position_data_url, headers={'User-Agent': get_ua()}, timeout=10)
            fund_positions_data = resolve_position_info(fund_code[1:], response_data.text)
            # Collect the results
            rank_detail_data.append(rank_detail_info)
            position_data.extend(fund_positions_data)
        except Exception:
            error_funds_list.append(fund_code)
            print("Failed to crawl {0}; it will be retried later!".format(str(fund_code[1:])))
        # Random 2-4 second sleep between funds
        time.sleep(random.randint(2, 4))
    # Retry the funds that failed; try_craw_info returns (None, None) when it gives up
    for fund_info in error_funds_list:
        rank_detail_data_try, position_data_try = try_craw_info(fund_info, 1)
        if rank_detail_data_try is not None:
            # Collect the retried results
            rank_detail_data.append(rank_detail_data_try)
            position_data.extend(position_data_try)
    df_rank_detail_data = pd.DataFrame(rank_detail_data)
    df_position_data = pd.DataFrame(position_data)
    return df_rank_detail_data, df_position_data
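# A minimal end-to-end sketch (not part of the original script): crawl the
# rank list, then the Top-N funds' detail and position data. The base URL,
# fund_type and filenames are assumptions; see the note above demo_rank_data().
def demo_fund_pipeline(top_n=20):
    base_url = 'http://fund.eastmoney.com/data/rankhandler.aspx'  # assumed endpoint
    df_rank = get_rank_data(base_url, page_index=0, max_page=2, fund_type='gp')  # assumed type code
    df_detail, df_position = get_position_data(df_rank, top_n)
    df_detail.to_csv('fund_detail.csv', index=False, encoding='utf-8-sig')  # assumed filenames
    df_position.to_csv('fund_position.csv', index=False, encoding='utf-8-sig')
    return df_detail, df_position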
def get_answer_info(url, index):
    """
    Parse one page of answers for a question.
    @param url: answers API URL for this page
    @param index: zero-based page index (used only for logging)
    @return: DataFrame with one row per answer
    """
    response = requests.get(url=url, headers={'User-Agent': get_ua()}, timeout=10)
    text = response.text.replace('\u200b', '')
    per_answer_list = []
    try:
        question_json = json.loads(text)
        # Walk the answers on the current page
        print("Crawling answer page {0}: got {1} answers".format(index + 1, len(question_json["data"])))
        for data in question_json["data"]:
            # Question info: type, id, question type, created/updated timestamps
            question_type = data["question"]['type']
            question_id = data["question"]['id']
            question_question_type = data["question"]['question_type']
            question_created = get_time_str(data["question"]['created'])
            question_updated_time = get_time_str(data["question"]['updated_time'])
            # Author info: name, headline, gender, follower count
            author_name = data["author"]['name']
            author_headline = data["author"]['headline']
            author_gender = data["author"]['gender']
            author_follower_count = data["author"]['follower_count']
            # Answer info: id, created/updated timestamps, upvotes, comments, content
            answer_id = data['id']
            created_time = get_time_str(data["created_time"])
            updated_time = get_time_str(data["updated_time"])
            voteup_count = data["voteup_count"]
            comment_count = data["comment_count"]
            content = data["content"]
            per_answer_list.append([
                question_type, question_id, question_question_type,
                question_created, question_updated_time, author_name,
                author_headline, author_gender, author_follower_count,
                answer_id, created_time, updated_time, voteup_count,
                comment_count, content
            ])
    except Exception:
        print("JSON format validation failed")
    finally:
        answer_column = [
            '问题类型', '问题id', '问题提问类型', '问题创建时间', '问题更新时间',
            '答主用户名', '答主签名', '答主性别', '答主粉丝数', '答案id',
            '答案创建时间', '答案更新时间', '答案赞同数', '答案评论数', '答案具体内容'
        ]
        per_answer_data = pd.DataFrame(per_answer_list, columns=answer_column)
    return per_answer_data
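# A minimal paging sketch (not part of the original script). The answers API
# URL pattern below is an assumption about Zhihu's v4 endpoint; the original
# script presumably builds it elsewhere, so adjust it to match.
def demo_question_answers(question_id, page_count):
    # Basic question info from the question page (zhihu.com/question/<id>)
    title, question, follower, watched, answer_count, tag_list = \
        get_question_base_info('https://www.zhihu.com/question/{0}'.format(question_id))
    print('{0}: {1} followers, {2} views, {3} answers'.format(title, follower, watched, answer_count))
    base = ('https://www.zhihu.com/api/v4/questions/{0}/answers'
            '?include=content,voteup_count,comment_count&limit=20&offset={1}')  # assumed URL pattern
    pages = []
    for index in range(page_count):
        pages.append(get_answer_info(base.format(question_id, index * 20), index))
        time.sleep(random.randint(2, 4))  # pause between pages
    return pd.concat(pages, ignore_index=True)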