def extract_woman(html):
    """Extract readable text from a woman.ru page: title, lead paragraph,
    article body, extra text blocks, and comments."""
    soup = BeautifulSoup(html, 'lxml')
    # Remove boilerplate chrome before harvesting content.
    for junk in soup('div', {'class': 'article-info'}):
        junk.extract()
    for junk in soup('div', {'class': 'article__tags'}):
        junk.extract()
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    # Content sections in reading order, each followed by a big gap.
    sections = (
        {'class': 'article__lead-paragraph'},
        {'itemprop': 'articleBody'},
        {'class': 'container__content-text'},
        {'class': 'card__comment'},
    )
    for attrs in sections:
        for node in soup('div', attrs):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_pikabu(html):
    """Extract readable text from a pikabu.ru page: title, story description,
    story body, and comments."""
    soup = BeautifulSoup(html, 'lxml')
    for junk in soup('div', {'class': 'sidebar'}):
        junk.extract()
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    for cls in ('story__description', 'story__content', 'comment__content'):
        for node in soup('div', {'class': cls}):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_article(html):
    """Extract readable text from a generic article page.

    Prepends the page's <h1> headers to the main body found by
    ``html_to_article`` and flattens the result via ``fragment_to_text``.
    Returns '' for blank/whitespace-only input.
    """
    # Idiomatic emptiness check (was: `if not len(html.strip())`).
    if not html.strip():
        return ''
    soup = BeautifulSoup(html, 'lxml')
    header = '<br><br><br>'.join(str(node) for node in soup('h1'))
    article = html_to_article(html, 'ru')
    return fragment_to_text('<div>' + header + '<br>' * 3 + article + '</div>')
def extract_sport(html):
    """Extract readable text from a sports article page: title plus body."""
    soup = BeautifulSoup(html, 'lxml')
    pieces = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    for block in soup('div', {'class': 'article_text'}):
        pieces.append(str(block))
        pieces.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(pieces)))
def extract_ask(html):
    """Extract readable text from an ask.fm page: stream item headers and
    their contents (no page-level <h1> title on this site)."""
    soup = BeautifulSoup(html, 'lxml')
    # "Read more" teasers would duplicate content — drop them first.
    for junk in soup('p', {'class': 'readMore'}):
        junk.extract()
    parts = []
    for tag, cls in (('header', 'streamItem_header'),
                     ('div', 'streamItem_content')):
        for node in soup(tag, {'class': cls}):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_otvet(html):
    """Extract readable text from an otvet.mail.ru page.

    Input lacking a full <head>/<body> skeleton is returned unchanged —
    presumably it is already plain extracted text.
    """
    # Guard clause replaces the original if/else nesting.
    if '</head>' not in html or '</body>' not in html:
        return html
    soup = BeautifulSoup(html, 'lxml')
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    for node in soup('div', {'itemprop': 'text'}):
        parts.append(str(node))
        parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_lurk(html):
    """Extract readable text from a lurkmore wiki page.

    Returns '' for missing pages ("В базе данных не найдено") and for
    service-namespace pages (user pages, talk pages, templates, categories,
    files, etc.), identified by prefixes appearing in the <h1> header.
    """
    if 'В базе данных не найдено' in html:
        return ''
    soup = BeautifulSoup(html, 'lxml')
    # Strip navigation/service chrome before collecting the article body.
    for node in soup('table', {'class': 'lm-plashka'}):
        node.extract()
    for node in soup('table', {'id': 'toc'}):
        node.extract()
    for node in soup('div', {'class': 'buttons-line'}):
        node.extract()
    for node in soup('div', {'class': 'noprint'}):
        node.extract()
    for node in soup(None, {'class': 'mw-collapsible'}):
        node.extract()
    header = '<br><br><br>'.join(str(node) for node in soup('h1'))
    # NOTE(review): the original source was garbled here
    # ('User:'******'Mediawiki:' is not valid Python); reconstructed as two
    # separate prefixes — confirm against upstream history.
    bad_titles = (
        'User:', 'Mediawiki:', 'Special:', 'Lurkmore:',
        'Участник:', 'Служебная:', 'Обсуждение:', 'Категория:', 'Портал:',
        'Обсуждение портала:', 'Шаблон:', 'Обсуждение участника:', 'Файл:',
        'Обсуждение категории:', 'Обсуждение шаблона:',
        'Обсуждение копипасты:', 'Обсуждение смехуечков:',
        'Обсуждение файла:', 'Смехуечки:', 'Обсуждение MediaWiki:',
    )
    if any(bad in header for bad in bad_titles):
        return ''
    content = [header]
    for node in soup('div', {'id': 'mw-content-text'}):
        content.append(str(node))
        content.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(content)))
def extract_irec(html):
    """Extract readable text from an irecommend.ru review: review title,
    review body, and comment replies."""
    soup = BeautifulSoup(html, 'lxml')
    titles = soup('h2', {'class': 'reviewTitle'})
    parts = ['<br><br><br>'.join(str(t) for t in titles)]
    for attrs in ({'itemprop': 'reviewBody'}, {'class': 'cmntreply-text'}):
        for node in soup('div', attrs):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_mk(html):
    """Extract readable text from an mk.ru article: title, description, and body."""
    soup = BeautifulSoup(html, 'lxml')
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    for prop in ('description', 'articleBody'):
        for node in soup('div', {'itemprop': prop}):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_habr(html):
    """Extract readable text from a habr.com page: title, post body, and
    comments (comments get a smaller separator gap than the post)."""
    soup = BeautifulSoup(html, 'lxml')
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    # (css class, trailing <br> count) — post gets 10, comments get 3.
    for cls, gap in (('post__text', 10), ('comment__message', 3)):
        for node in soup('div', {'class': cls}):
            parts.append(str(node))
            parts.append('<br>' * gap)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_rbc(html):
    """Extract readable text from an rbc.ru article: title, subtitle, and body."""
    soup = BeautifulSoup(html, 'lxml')
    # The lead image block contributes no text — drop it.
    for junk in soup('div', {'class': 'article__main-image'}):
        junk.extract()
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    for attrs in ({'class': 'article__header__subtitle'},
                  {'itemprop': 'articleBody'}):
        for node in soup('div', attrs):
            parts.append(str(node))
            parts.append('<br>' * 10)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))
def extract_lj(html):
    """Extract readable text from a livejournal page.

    Combines the article text (via ``extract_article``) with comments
    recovered from the embedded ``Site.page`` JSON blob; the two are joined
    by three newlines even when there are no comments.
    """
    comments = _extract_lj_comments(html)
    if comments:
        comments_text = fragment_to_text('<br><br><br>'.join(comments))
    else:
        comments_text = ''
    return extract_article(html) + '\n\n\n' + comments_text


def _extract_lj_comments(html):
    """Best-effort: return comment HTML fragments from the Site.page JSON.

    Skips bot authors and comments without a body.  Any parsing failure is
    logged and yields an empty list — it must not break article extraction.
    """
    comments = []
    try:
        meta_text = re.findall(r'\sSite\.page = (\{")([\s\S]+?)(\});\s+Site', html)
        if len(meta_text) == 1 and len(meta_text[0]) == 3:
            meta_dict = json.loads(''.join(meta_text[0]))
            for c in meta_dict.get('comments', []):
                # Skip bots (username contains 'bot') and anonymous comments.
                if 'uname' not in c or 'bot' in c['uname']:
                    continue
                # Skip comments with a missing or empty body.
                if not c.get('article'):
                    continue
                comments.append(c['article'])
    except Exception as e:
        # Deliberate best-effort swallow, preserved from the original;
        # comments are optional enrichment only.
        print(e)
    return comments
def extract_drive2(html):
    """Extract readable text from a drive2.ru page: title, article/review
    body, and comments (comments get a smaller separator gap)."""
    soup = BeautifulSoup(html, 'lxml')
    parts = ['<br><br><br>'.join(str(h) for h in soup('h1'))]
    # (attribute selector, trailing <br> count)
    selectors = (
        ({'itemprop': 'articleBody'}, 10),
        ({'itemprop': 'reviewBody'}, 10),
        ({'class': 'c-comment__text'}, 3),
    )
    for attrs, gap in selectors:
        for node in soup('div', attrs):
            parts.append(str(node))
            parts.append('<br>' * gap)
    return fragment_to_text('<div>{}</div>'.format(''.join(parts)))