async def data_extraction_for_web(html):
    """
    Extract title, url and update time from a single search-result block.
    :param html: a BeautifulSoup tag for one result block
    :return: dict of parsed fields, or None if the result should be skipped
    """
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url or urlparse(url).netloc in BLACK_DOMAIN:
                return None
            netloc = urlparse(url).netloc
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            time = re.findall(r'\d+-\d+-\d+', source)
            time = time[0] if time else None
            timestamp = 0
            if time:
                try:
                    time_list = [int(i) for i in time.split('-')]
                    timestamp = arrow.get(time_list[0], time_list[1], time_list[2]).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
def extract_pre_next_chapter(chapter_url, html):
    """
    Extract the "previous chapter" / "next chapter" links from a chapter page.
    :param chapter_url: url of the current chapter page
    :param html: raw html of the chapter page
    :return: OrderedDict mapping link text to absolute url
    """
    next_chapter = OrderedDict()
    try:
        # Regexes adapted from https://greasyfork.org/zh-CN/scripts/292-my-novel-reader
        next_reg = r'(<a\s+.*?>.*[上前下后][一]?[页张个篇章节步].*?</a>)'
        judge_reg = r'^[上前下后][一]?[页张个篇章节步]$'
        # Re-parse the matched anchors with BeautifulSoup
        next_res = re.findall(next_reg, html, re.I)
        str_next_res = '\n'.join(next_res)
        next_res_soup = BeautifulSoup(str_next_res, 'html5lib')
        for link in next_res_soup.find_all('a'):
            text = link.text or ''
            text = text.replace(' ', '')
            is_next = re.search(judge_reg, text)
            if is_next:
                url = urljoin(chapter_url, link.get('href')) or ''
                next_chapter[text] = url
        return next_chapter
    except Exception as e:
        LOGGER.exception(e)
        return next_chapter
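# A minimal usage sketch for extract_pre_next_chapter(); the sample markup and
# the chapter url below are made up for illustration and are not project data.
def _demo_extract_pre_next_chapter():
    sample_html = (
        '<div class="page"><a href="/book/1/10.html">上一章</a>'
        '<a href="/book/1/index.html">目录</a>'
        '<a href="/book/1/12.html">下一章</a></div>'
    )
    links = extract_pre_next_chapter('http://example.com/book/1/11.html', sample_html)
    # Expected: OrderedDict([('上一章', 'http://example.com/book/1/10.html'),
    #                        ('下一章', 'http://example.com/book/1/12.html')])
    return links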
async def fetch(client, url, name, is_web):
    """
    Request a search-result page for the given novel name.
    :param client: aiohttp client session
    :param url: search endpoint
    :param name: novel name used as the query keyword
    :param is_web: truthy to send web-search params ('wd'), otherwise 'word'
    :return: response body as text (raw bytes if decoding fails), None on error
    """
    with async_timeout.timeout(15):
        try:
            headers = {'user-agent': get_random_user_agent()}
            if is_web:
                params = {'wd': name, 'ie': 'utf-8', 'rn': BAIDU_RN, 'vf_bl': 1}
            else:
                params = {'word': name}
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fall back to raw bytes when the body cannot be decoded
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
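# A minimal driver sketch for the four-argument fetch() defined above.  The
# search endpoint url and the query name are placeholders, not project values.
async def _demo_fetch_search(name):
    import aiohttp
    async with aiohttp.ClientSession() as client:
        return await fetch(client, 'http://www.baidu.com/s', name, is_web=1)

# e.g. asyncio.get_event_loop().run_until_complete(_demo_fetch_search('择天记'))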
async def fetch(client, url):
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fall back to raw bytes when the body cannot be decoded
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_phone(html):
    with async_timeout.timeout(10):
        try:
            # Get the target url from the block's data-log attribute
            data_log = eval(html['data-log'])
            url = data_log.get('mu', None)
            if not url:
                return None
            # Get title
            title = html.find('h3').get_text()
            # Get author and update_time (optional)
            novel_mess = html.find_all(class_='c-gap-right-large')
            basic_mess = [i.get_text() for i in novel_mess] if novel_mess else None
            return {'title': title, 'url': url, 'basic_mess': basic_mess}
        except Exception as e:
            LOGGER.exception(e)
            return None
async def get_real_url(client, url):
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers, allow_redirects=True) as response:
                assert response.status == 200
                LOGGER.info('Parse url: {}'.format(response.url))
                # With allow_redirects=True, response.url is the final url after redirects
                url = response.url if response.url else None
                return url
        except Exception as e:
            LOGGER.exception(e)
            return None
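# A minimal usage sketch for get_real_url(); the redirect link below is a
# hypothetical baidu "/link?url=..." style url, not real project data.
async def _demo_get_real_url():
    import aiohttp
    redirect_url = 'http://www.baidu.com/link?url=xxxx'
    async with aiohttp.ClientSession() as client:
        return await get_real_url(client, redirect_url)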
async def data_extraction_for_web_baidu(client, html):
    with async_timeout.timeout(20):
        try:
            # Each result block is an "h3.t a" anchor whose href is a baidu
            # redirect link (http://www.baidu.com/link?url=...), so it has to
            # be resolved to the real target url first.
            url = html.select('h3.t a')[0].get('href', None)
            real_url = await get_real_url(client=client, url=url) if url else None
            if real_url:
                netloc = urlparse(str(real_url)).netloc
                if 'baidu' in str(real_url) or netloc in BLACK_DOMAIN:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                title = html.select('h3.t a')[0].get_text()
                # Update time is not extracted for baidu web results; keep placeholders
                timestamp = 0
                time = ""
                return {
                    'title': title,
                    'url': str(real_url).replace('index.html', ''),
                    'time': time,
                    'is_parse': is_parse,
                    'timestamp': timestamp,
                    'netloc': netloc
                }
            else:
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
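# A hypothetical end-to-end sketch tying the four-argument fetch(),
# get_real_url() and data_extraction_for_web_baidu() together.  The search
# endpoint and the "div.result" selector for result blocks are assumptions,
# not project code.
async def _demo_parse_baidu_results(name):
    import asyncio
    import aiohttp
    from bs4 import BeautifulSoup
    async with aiohttp.ClientSession() as client:
        text = await fetch(client, 'http://www.baidu.com/s', name, is_web=1)
        if not text:
            return []
        soup = BeautifulSoup(text, 'html5lib')
        tasks = [data_extraction_for_web_baidu(client, block)
                 for block in soup.select('div.result')]
        results = await asyncio.gather(*tasks)
        return [r for r in results if r]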