def parse(html, keywords, url_prefix):
    """Collect keyword-matching links from ``<header class="entry-header">`` blocks.

    Every ``<a>`` descendant of each such header is offered to
    ``base_parser.isContainKeyword(keywords, text)``.  An accepted anchor is
    then classified by calling the same helper with ``'tags'`` and the
    anchor's ``href``:

    * no match (title link): the anchor itself is recorded;
    * match (tag link): the first anchor of the enclosing header is recorded
      instead (presumably the entry title -- confirm against the page layout).

    Anchors that carry no ``href`` attribute are skipped instead of raising
    ``KeyError``.

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: String prepended to every recorded ``href``.

    Returns:
        dict mapping link text to ``url_prefix + href``.  A later link with the
        same text overwrites an earlier one.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for header in soup.find_all('header', attrs={'class': 'entry-header'}):
        # Tag matches are reported through the header's first anchor; it is the
        # same for every descendant, so look it up once per header.
        first_link = header.find('a')
        for child in header.descendants:
            if child.name != 'a':
                continue
            if not base_parser.isContainKeyword(keywords, child.text):
                continue
            # Tag.get() returns None for a missing attribute, whereas
            # child['href'] would raise KeyError and abort the whole parse.
            href = child.get('href')
            if href is None:
                continue
            if base_parser.isContainKeyword('tags', href):
                # tags
                target = first_link
            else:
                # titles
                target = child
            target_href = target.get('href')
            if target_href is None:
                continue
            results[target.text] = url_prefix + target_href
    return results
def parse(html, keywords, url_prefix):
    """Return links found in ``entry-header`` elements that match *keywords*.

    For each ``<header class="entry-header">`` the function walks all
    descendants and keeps the ``<a>`` elements whose text is accepted by
    ``base_parser.isContainKeyword(keywords, text)``.  If the same helper,
    called with ``'tags'`` and the anchor's ``href``, reports a match, the
    header's first anchor is stored in place of the matched one; otherwise
    the matched anchor is stored.

    An anchor without an ``href`` attribute is ignored rather than raising
    ``KeyError``.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser``.
        keywords: Forwarded as-is to ``base_parser.isContainKeyword``.
        url_prefix: String concatenated in front of each stored ``href``.

    Returns:
        dict of ``{link text: url_prefix + href}``; duplicate texts keep the
        last value written.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for header in soup.find_all('header', attrs={'class': 'entry-header'}):
        for child in header.descendants:
            is_candidate = child.name == 'a' and base_parser.isContainKeyword(
                keywords, child.text)
            if not is_candidate:
                continue
            # .get() yields None when the attribute is absent; indexing with
            # ['href'] would raise KeyError on such anchors.
            href = child.get('href')
            if href is None:
                continue
            if base_parser.isContainKeyword('tags', href):
                # tags: report the header's first anchor instead
                link = header.find('a')
            else:
                # titles: report the matched anchor itself
                link = child
            link_href = link.get('href')
            if link_href is not None:
                results[link.text] = url_prefix + link_href
    return results
def parse(html, keywords, url_prefix):
    """Collect absolute links from ``<td class="title">`` cells that match *keywords*.

    For each such cell the first ``<a>`` is examined.  It is recorded when
    ``base_parser.isContainKeyword(keywords, text)`` accepts its text and its
    ``href`` starts with ``'http'``.  Cells without an anchor, and anchors
    without an ``href`` attribute, are skipped (the latter previously raised
    ``KeyError``).

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: Unused here; only ``href`` values starting with ``'http'``
            are kept and they are stored as-is.  The parameter is retained so
            the signature stays the same for callers.

    Returns:
        dict mapping link text to its ``href``.  A later link with the same
        text overwrites an earlier one.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for cell in soup.find_all('td', attrs={'class': 'title'}):
        link = cell.find('a')
        if link is None:
            continue
        if not base_parser.isContainKeyword(keywords, link.text):
            continue
        # Default to '' so an anchor without href fails the prefix test
        # instead of raising KeyError.
        href = link.get('href', '')
        if href.startswith('http'):
            results[link.text] = href
    return results
def parse(html, keywords, url_prefix):
    """Collect keyword-matching entries from ``<div class="news_type_block">`` elements.

    Inside each block the title element is reached positionally: the second
    child of the block, then the second child of that node (the local name
    ``h2_block`` in the original suggests an ``<h2>`` wrapping a link --
    confirm against the page layout).

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: Forwarded to ``base_parser.get_url`` together with the
            title element's ``href`` and the literal ``"/news"``.

    Returns:
        dict mapping ``base_parser.simplify_text(title text)`` to the value
        returned by ``base_parser.get_url``.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = {}
    for block in soup.find_all("div", class_="news_type_block"):
        # Index 1 is used at both levels, exactly as the page layout requires.
        title_node = block.contents[1].contents[1]
        if not base_parser.isContainKeyword(keywords, title_node.text):
            continue
        # Build the URL before the key: this is the order in which the
        # original single assignment statement evaluated the two helper calls.
        link = base_parser.get_url(url_prefix, title_node["href"], "/news")
        title = base_parser.simplify_text(title_node.text)
        results[title] = link
    return results
def parse(html, keywords, url_prefix):
    """Map simplified news titles to their URLs for entries matching *keywords*.

    Scans every ``<div class="news_type_block">``.  The title element is
    taken positionally as ``block.contents[1].contents[1]``; its text is
    tested with ``base_parser.isContainKeyword`` and, when accepted, stored
    under ``base_parser.simplify_text(text)``.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser``.
        keywords: Forwarded as-is to ``base_parser.isContainKeyword``.
        url_prefix: First argument given to ``base_parser.get_url``; the other
            two are the title element's ``href`` and the literal ``'/news'``.

    Returns:
        dict of ``{simplified title: result of base_parser.get_url}``.
    """
    document = BeautifulSoup(html, 'html.parser')
    found = {}
    for news_div in document.find_all('div', class_='news_type_block'):
        heading = news_div.contents[1]
        anchor = heading.contents[1]
        raw_title = anchor.text
        if base_parser.isContainKeyword(keywords, raw_title):
            # get_url is called first, then simplify_text, matching the
            # evaluation order of the original assignment.
            url = base_parser.get_url(url_prefix, anchor['href'], '/news')
            found[base_parser.simplify_text(raw_title)] = url
    return found