def parse_url(offset):
    resp = requests.get(base_url, params={'page': offset})
    print("Parsing: " + resp.url)
    result = []
    if resp.status_code == 200:
        soup = t.get_bs(resp.content)
        tables = soup.select('table[width="100%"]')
        for table in tables:
            a = table.find('a')
            detail_url = a['href']      # song detail page
            img_url = a.img['src']      # cover image URL
            music_name = a.img['alt']   # song title
            p = table.find('p')
            data_split = p.get_text().split("/")
            singer = data_split[0].strip()       # artist
            public_date = data_split[1].strip()  # release date
            category = ""                        # genre tags
            for data in data_split[2:]:
                category += data.strip() + "/"
            div = table.find('div', class_="star clearfix")
            score = div.select('span.rating_nums')[0].text  # rating
            rate_count = rate_count_pattern.search(
                div.select('span.pl')[0].get_text()).group(0)  # number of ratings
            result.append([
                img_url, music_name, singer, public_date, category,
                score, rate_count, detail_url
            ])
    return result
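# Sketch of the module-level setup parse_url relies on but which is not shown in
# this listing: `t` is the author's helper module, `base_url` the listing URL and
# `rate_count_pattern` a regex over the rating-count text. Everything below is an
# illustrative stand-in under those assumptions, not the author's actual configuration.
import re

import requests
from bs4 import BeautifulSoup

base_url = 'https://example.com/music/chart'   # hypothetical listing URL
rate_count_pattern = re.compile(r'\d+')        # pulls the digits out of the rating-count span


class t:                                       # stand-in for the author's helper module
    @staticmethod
    def get_bs(markup):
        return BeautifulSoup(markup, 'html.parser')


if __name__ == '__main__':
    for offset in (0, 25, 50, 75):             # one call per listing page
        for row in parse_url(offset):
            print(row)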
def extract_text(url): report = "" resp = requests.get(news_url).content if resp is not None: soup = t.get_bs(resp) ps = soup.select('div#main_content p') for p in ps[:-1]: report += p.text return report
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # use the article title as the folder / record name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']), file_save_path)
def get_page_count():
    try:
        resp = requests.get(list_url, headers=headers, timeout=5)
        if resp is not None:
            soup = tools.get_bs(resp.text)
            page_count = int(soup.select('li.page-item')[-2].text)  # second-to-last pager item holds the last page number
            print("Parsed article page count: " + str(page_count))
            return page_count
    except Exception as e:
        print(str(e))
def get_article_url(url):
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp is not None:
            print("Parsing: " + resp.request.url)
            soup = tools.get_bs(resp.text)
            hrefs = soup.select('span.link_title a')
            for a in hrefs:
                tools.write_str_data(a['href'], articles_file)
        return None
    except Exception as e:
        print(str(e))
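# Sketch of how the two functions above fit together: read the page count once,
# then walk every list page and append its article links to a file. The page-URL
# pattern and the module-level names (list_url, headers, articles_file, the
# `tools` helper) are illustrative assumptions, not the author's actual values.
import requests
from bs4 import BeautifulSoup

list_url = 'https://example.com/blog/articles'   # hypothetical article-list URL
headers = {'User-Agent': 'Mozilla/5.0'}          # minimal request headers
articles_file = 'articles.txt'                   # file collecting one link per line


class tools:                                     # stand-in for the author's helper module
    @staticmethod
    def get_bs(markup):
        return BeautifulSoup(markup, 'html.parser')

    @staticmethod
    def write_str_data(line, path):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(line + '\n')


if __name__ == '__main__':
    count = get_page_count()
    if count:
        for page in range(1, count + 1):
            get_article_url(list_url + '?page=' + str(page))   # hypothetical paging scheme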
def catch_pic_diagrams_url(url):
    url_list = []
    print("Fetching photo set: " + url)
    resp = requests.get(url)
    if resp.status_code == 404:
        return None
    if resp is not None:
        soup = t.get_bs(resp.content)
        article = soup.select("article.excerpt a.thumbnail")
        for a in article:
            url_list.append(a['href'])
    return url_list
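# Sketch of how catch_pic_diagrams_url feeds catch_pic_diagrams: collect the
# photo-set links from each list page, then record every image URL of each set.
# The list-page URL pattern, file_save_path and the `t` helper (see the stand-in
# near parse_url above) are assumptions for illustration only.
file_save_path = 'pic_urls.txt'                                  # hypothetical output file

if __name__ == '__main__':
    for page in range(1, 4):                                     # hypothetical paging scheme
        links = catch_pic_diagrams_url('https://example.com/page/' + str(page))
        if links:
            for link in links:
                catch_pic_diagrams(link)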