def parse(html, keywords, url_prefix):
    """Collect keyword-matching links from ``<header class="entry-header">`` elements.

    Every ``<a>`` inside such a header is tested with
    ``base_parser.isContainKeyword(keywords, <link text>)``.  For a match:

    * if ``base_parser.isContainKeyword('tags', href)`` is falsy, the link is
      recorded under its own text (the original code labels this "titles");
    * otherwise (labelled "tags") the header's *first* ``<a>`` is recorded
      instead -- presumably the post title; verify against the page markup.

    Anchors that carry no ``href`` attribute are skipped rather than raising
    ``KeyError``.

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: String prepended to every recorded ``href``.

    Returns:
        dict mapping link text to ``url_prefix + href``.  A later link with
        the same text overwrites an earlier one.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for header in soup.find_all('header', attrs={'class': 'entry-header'}):
        # find_all() searches all descendants in document order, so
        # anchors[0] is the same element header.find('a') would return.
        anchors = header.find_all('a')
        for anchor in anchors:
            if not base_parser.isContainKeyword(keywords, anchor.text):
                continue
            href = anchor.get('href')
            if href is None:
                # e.g. <a name="..."> -- nothing to link to.
                continue

            target, target_href = anchor, href
            if base_parser.isContainKeyword('tags', href):
                # tags: record the header's first link rather than this one.
                target = anchors[0]
                target_href = target.get('href')
                if target_href is None:
                    continue
            results[target.text] = url_prefix + target_href
    return results
def parse(html, keywords, url_prefix):
    """Extract keyword-matching links from ``entry-header`` blocks.

    Walks every descendant of each ``<header class="entry-header">`` and
    looks at the ``<a>`` elements whose text satisfies
    ``base_parser.isContainKeyword(keywords, text)``.

    * When ``base_parser.isContainKeyword('tags', href)`` is falsy, the link
      itself is stored (text -> ``url_prefix + href``).
    * Otherwise the first ``<a>`` of the enclosing header is stored instead
      (presumably the article title link -- confirm against the site markup).

    An ``<a>`` without an ``href`` attribute is ignored instead of raising
    ``KeyError``.

    Args:
        html: Markup parsed by ``BeautifulSoup`` using ``html.parser``.
        keywords: Forwarded as-is to ``base_parser.isContainKeyword``.
        url_prefix: String concatenated in front of each stored ``href``.

    Returns:
        dict of link text -> ``url_prefix + href``; duplicate texts keep the
        last value written.
    """
    soup = BeautifulSoup(html, 'html.parser')
    header_list = soup.find_all('header', attrs={'class': 'entry-header'})
    results = {}
    for header in header_list:
        for child in header.descendants:
            if child.name != 'a':
                continue
            if not base_parser.isContainKeyword(keywords, child.text):
                continue
            href = child.get('href')
            if href is None:
                # Anchor without a target (e.g. <a name="...">): skip it.
                continue

            if not base_parser.isContainKeyword('tags', href):
                # titles
                results[child.text] = url_prefix + href
                continue

            # tags: store the header's first link; look it up only once.
            first_link = header.find('a')
            first_href = first_link.get('href')
            if first_href is not None:
                results[first_link.text] = url_prefix + first_href
    return results
# Example #3
# 0
def parse(html, keywords, url_prefix):
    """Collect absolute links found in ``<td class="title">`` cells.

    For each such cell the first ``<a>`` is examined.  It is recorded when
    ``base_parser.isContainKeyword(keywords, <link text>)`` is truthy and its
    ``href`` starts with ``"http"``.  Cells without an ``<a>``, and anchors
    without an ``href`` attribute, are skipped (the latter used to raise
    ``KeyError``).

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: Not used by this parser; only links whose ``href`` already
            starts with ``"http"`` are kept, and they are stored unmodified.

    Returns:
        dict mapping link text to its ``href``.  A later link with the same
        text overwrites an earlier one.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for title in soup.find_all('td', attrs={'class': 'title'}):
        a = title.find('a')
        if a is None:
            continue
        if not base_parser.isContainKeyword(keywords, a.text):
            continue
        href = a.get('href')
        # Missing href -> None; treat it like a non-absolute link.
        if href and href.startswith('http'):
            results[a.text] = href
    return results
def parse(html, keywords, url_prefix):
    """Build a title -> URL mapping from ``<div class="news_type_block">`` elements.

    For every such div, the node at ``contents[1]`` of the div is taken, and
    then the node at ``contents[1]`` of that node (presumably an ``<h2>``
    wrapping an ``<a>`` -- confirm against the page markup).  When
    ``base_parser.isContainKeyword(keywords, <node text>)`` is truthy, the
    node is stored with key ``base_parser.simplify_text(<node text>)`` and
    value ``base_parser.get_url(url_prefix, <node href>, "/news")``.

    Args:
        html: Markup handed to ``BeautifulSoup`` with the ``html.parser`` backend.
        keywords: Passed through unchanged to ``base_parser.isContainKeyword``.
        url_prefix: Forwarded as the first argument of ``base_parser.get_url``.

    Returns:
        dict of simplified title text -> value returned by ``get_url``.
    """
    matches = {}
    document = BeautifulSoup(html, "html.parser")
    for news_div in document.find_all("div", class_="news_type_block"):
        # Positional lookup: index 1 of the div, then index 1 of that node.
        link = news_div.contents[1].contents[1]
        title = link.text
        if not base_parser.isContainKeyword(keywords, title):
            continue
        url = base_parser.get_url(url_prefix, link["href"], "/news")
        matches[base_parser.simplify_text(title)] = url
    return matches
def parse(html, keywords, url_prefix):
    """Return keyword-matching news links keyed by their simplified title.

    Each ``<div class="news_type_block">`` is inspected positionally: the
    child at index 1 of the div, then the child at index 1 of that child
    (presumably ``<h2>`` and its ``<a>`` -- verify against the site's HTML).
    If ``base_parser.isContainKeyword(keywords, text)`` accepts the inner
    node's text, the result gains an entry
    ``base_parser.simplify_text(text)`` ->
    ``base_parser.get_url(url_prefix, href, '/news')``.

    Args:
        html: Markup parsed by ``BeautifulSoup`` using ``html.parser``.
        keywords: Forwarded as-is to ``base_parser.isContainKeyword``.
        url_prefix: First argument given to ``base_parser.get_url``.

    Returns:
        dict mapping simplified title text to the value from ``get_url``.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    found = {}
    for container in parsed.find_all('div', class_='news_type_block'):
        heading = container.contents[1]
        anchor = heading.contents[1]
        headline = anchor.text
        if base_parser.isContainKeyword(keywords, headline):
            target = base_parser.get_url(url_prefix, anchor['href'], '/news')
            label = base_parser.simplify_text(headline)
            found[label] = target
    return found