import re

from bs4 import BeautifulSoup as BS


def get_topic_words(topic):
    html_content = get_topic_page(topic)
    more_link_word = get_some_more_links_word(html_content)
    if len(more_link_word):
        # Limit to three links so it runs faster
        for word in more_link_word[0:3]:
            html_content += get_topic_page(word)
    words = re.findall(r"[а-яА-Я\-']{3,}", html_content)
    return words

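# Every snippet here relies on get_topic_page(), which is not defined in this
# collection. A minimal sketch of what it is assumed to do: fetch the raw HTML
# of a Russian Wikipedia article. The URL scheme and the use of urllib are
# assumptions for illustration, not the original implementation.
from urllib.parse import quote
from urllib.request import urlopen


def get_topic_page(topic):
    # Accept either a bare topic name or an already-formed "/wiki/..." path,
    # since some snippets feed links from get_wiki_links() back in
    if topic.startswith("/wiki/"):
        url = "https://ru.wikipedia.org" + topic
    else:
        url = "https://ru.wikipedia.org/wiki/" + quote(topic)
    with urlopen(url) as response:
        return response.read().decode("utf-8")
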
def get_wiki_links(link):
    html_content = get_topic_page(link)
    soup = BS(html_content, 'html.parser')
    links = soup.find_all("a")
    links = [link.get('href', '') for link in links]
    # The unescaped dot in './wiki/' matches any character, so the second check
    # drops links that have anything before '/wiki/' and keeps only relative
    # article links that start with '/wiki/'
    links = [
        link for link in links
        if re.search('/wiki/', link) and not re.search('./wiki/', link)
    ]
    return links

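# Usage sketch (the topic name is only an example): the relative links that
# get_wiki_links() returns can be fed straight back into get_topic_page()
# from the sketch above.
if __name__ == "__main__":
    for wiki_link in get_wiki_links("Дерево")[:3]:
        neighbour_page = get_topic_page(wiki_link)
        print(wiki_link, len(neighbour_page))
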
def get_topic_tables(topic):
    html_content = get_topic_page(topic)
    soup = BS(html_content, "html.parser")
    tables = soup.find_all("table")
    tbs = soup.select("table.standard")
    for t in tbs:
        trs = t.select("tr")
        print(len(trs))
    hrs = [t.get("class", "") for t in tables]
    print(hrs)
    return hrs

def get_topic_text(topic):
    html_content = get_topic_page(topic)
    words = re.findall(r"[а-яА-Я\-']+", html_content)
    text = " ".join(words)
    return text

def get_topic_words(topic):
    html_content = get_topic_page(topic)
    words = re.findall(r"[а-яА-Я\-']+", html_content)
    return words

def get_topic_words(link):
    html_content = get_topic_page(link)
    words = re.findall(r"[а-яА-Я\-']{3,}", html_content)
    return words

def get_topic_words(topic):
    html_content = get_topic_page(topic)
    words = re.findall(r"[а-яёА-Я\-']{3,}", html_content)
    # text = " ".join(words)
    return words

def get_topic_links(topic):
    # Despite the name, this variant only prints the <tr> elements it finds
    # and returns nothing
    html_content = get_topic_page(topic)
    soup = BS(html_content, "html.parser")
    links = soup.find_all("tr")
    print(links)

def get_neighbo_pages(topic):
    nlinks = get_neighbo_links(topic)
    html_pages = [get_topic_page(n) for n in nlinks]
    return html_pages

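# get_neighbo_links() is not shown in this collection. A hedged sketch of what
# it is assumed to do, reusing get_wiki_links() from above; the cap of ten
# links is an arbitrary illustration, not part of the original code.
def get_neighbo_links(topic):
    # Neighbouring pages are taken to be the first internal article links
    return get_wiki_links(topic)[:10]
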
def get_topic_words(topic):
    html_content = get_topic_page(topic)
    words = re.findall(r"[а-яА-Я][а-яА-Я\-']+[а-яА-Я]", html_content)
    # Capitalize each word so that, e.g., "Дерево" and "дерево" are not
    # counted as different words
    return [w.capitalize() for w in words]

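# A small illustration of why the capitalization above matters when counting
# word frequencies; count_topic_words() is a hypothetical helper added here
# only for the example, not part of the original code.
from collections import Counter


def count_topic_words(topic):
    # Frequencies of the capitalized words: "Дерево" and "дерево" now fall
    # into the same bucket
    return Counter(get_topic_words(topic)).most_common(10)
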
def get_topic_links(topic):
    html_content = get_topic_page(topic)
    soup = BS(html_content, "html.parser")
    links = soup.find_all("a")
    hrefs = [n.get("href", "") for n in links]
    return hrefs

def get_topic_words(topic):
    html_content = get_topic_page(topic)
    # keep only words that are at least three letters long
    # (the earlier version matched words of any length)
    # words = re.findall(r"[а-яА-Я\-']+", html_content)
    words = re.findall(r"[а-яА-Я\-']{3,}", html_content)
    return words