Ejemplo n.º 1
0
def get_topic_words(topic):
    """Return Russian words (3+ chars) from the topic's page, enriched
    with words from up to three additionally linked pages."""
    html_content = get_topic_page(topic)

    more_link_word = get_some_more_links_word(html_content)

    # Limit to three links so it runs faster (translated from Russian).
    # Slicing an empty list yields an empty list, so no emptiness check needed.
    for word in more_link_word[0:3]:
        html_content += get_topic_page(word)

    # Fix: original class was "[а-яА-я...]" — lowercase 'я' where 'Я' was
    # intended. The Unicode range А-я happens to cover the same letters, so
    # the matched set is unchanged; the corrected form is just unambiguous.
    # NOTE(review): 'ё' is still not matched — Ejemplo 7 adds it; confirm intent.
    words = re.findall(r"[а-яА-Я\-\']{3,}", html_content)
    return words
Ejemplo n.º 2
0
def get_wiki_links(link):
    """Return hrefs on *link*'s page that point at /wiki/ articles,
    excluding relative "./wiki/" links."""
    html_content = get_topic_page(link)
    soup = BS(html_content, 'html.parser')
    anchors = soup.find_all("a")
    hrefs = [a.get('href', '') for a in anchors]
    # BUG FIX: the original used re.search('./wiki/', href) where '.' is an
    # unescaped regex wildcard, so it excluded every href with ANY character
    # before '/wiki/'. Escaping the dot excludes only relative "./wiki/" links.
    return [
        href for href in hrefs
        if re.search('/wiki/', href) and not re.search(r'\./wiki/', href)
    ]
Ejemplo n.º 3
0
def get_topic_tables(topic):
    """Print the row count of each "standard" table on the topic page and
    return (and print) the class attribute of every <table> found."""
    soup = BS(get_topic_page(topic), "html.parser")
    all_tables = soup.find_all("table")
    # Debug output: row counts for tables carrying the "standard" class.
    for standard_table in soup.select("table.standard"):
        print(len(standard_table.select("tr")))
    classes = [tbl.get("class", "") for tbl in all_tables]
    print(classes)
    return classes
Ejemplo n.º 4
0
def get_topic_text(topic):
    """Return all Russian words on the topic page joined into one
    space-separated string."""
    page_html = get_topic_page(topic)
    return " ".join(re.findall("[а-яА-Я\-\']+", page_html))
Ejemplo n.º 5
0
def get_topic_words(topic):
    """Return every Russian word found in the topic's page HTML."""
    page_html = get_topic_page(topic)
    return re.findall("[а-яА-Я\-\']+", page_html)
Ejemplo n.º 6
0
def get_topic_words(link):
    """Return Russian words of three or more characters from the linked page."""
    page_html = get_topic_page(link)
    return re.findall("[а-яА-Я\-']{3,}", page_html)
Ejemplo n.º 7
0
def get_topic_words(topic):
    """Return Russian words (3+ chars, 'ё' included) from the topic page."""
    page_html = get_topic_page(topic)
    return re.findall("[а-яёА-Я\-\']{3,}", page_html)
Ejemplo n.º 8
0
def get_topic_links(topic):
    """Print every <tr> element found on the topic page; returns None.

    NOTE(review): despite the name, this searches for "tr" tags, not "a",
    and returns nothing — confirm whether "a" was intended.
    """
    soup = BS(get_topic_page(topic), "html.parser")
    rows = soup.find_all("tr")
    print(rows)
Ejemplo n.º 9
0
def get_neighbo_pages(topic):
    """Fetch and return the HTML content of every page neighbouring *topic*."""
    neighbour_links = get_neighbo_links(topic)
    return [get_topic_page(link) for link in neighbour_links]
Ejemplo n.º 10
0
def get_topic_words(topic):
    """Return capitalized Russian words (3+ chars) from the topic page.

    Capitalizing folds case variants together — "Дерево" and "дерево"
    were previously counted as different words (translated from Russian).
    """
    page_html = get_topic_page(topic)
    found = re.findall(r'[а-яА-Я][а-яА-Я\-\']+[а-яА-Я]', page_html)
    return [word.capitalize() for word in found]
Ejemplo n.º 11
0
def get_topic_links(topic):
    """Return the href of every anchor on the topic page ('' when absent)."""
    soup = BS(get_topic_page(topic), "html.parser")
    return [anchor.get("href", "") for anchor in soup.find_all("a")]
Ejemplo n.º 12
0
def get_topic_words(topic):
    """Return Russian words of three or more characters from the topic page.

    Fix: the original called re.findall twice — the first result (all words,
    any length) was immediately overwritten by the second and has been removed.
    """
    html_content = get_topic_page(topic)
    # Words with more than 3 letters (translated from Russian).
    words = re.findall("[а-яА-Я\-\']{3,}", html_content)
    return words