Example #1
def get_recommend_links(start_url):
    # Fetch the start page and collect the href of every link inside
    # the <div class="page-recommend"> block.
    content = html.get_html_content(start_url, page_encoding='utf-8')
    div_recommend = content.find('div', 'page-recommend')
    all_a = div_recommend.find_all('a')
    home_recommend_links = []
    for a in all_a:
        home_recommend_links.append(a['href'])
    return home_recommend_links
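
All of these examples rely on a project helper module named html, whose get_html_content function returns a parsed BeautifulSoup tree (or None on failure). The original implementation is not shown here; the following is a minimal sketch of what such a helper might look like, assuming requests and BeautifulSoup, with page_encoding used to force the declared character set. It is an assumption based on how the examples call it, not the project's actual code.

# Hypothetical sketch of the project's html.get_html_content helper, inferred
# from how the examples use it; not the original implementation.
import requests
from bs4 import BeautifulSoup


def get_html_content(url, page_encoding=None):
    try:
        response = requests.get(url, timeout=10)
        if page_encoding is not None:
            # Force the declared encoding (e.g. 'utf-8' or 'gb18030') before decoding.
            response.encoding = page_encoding
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException:
        return None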
Example #2
import json


def get_link_by_page(url, max_page, output_file):
    # html, page and domain are module-level helpers/constants of the crawler project.
    case_num = 0

    for i in range(max_page):
        print('page', i)

        soup = html.get_html_content(url)

        # get all links
        links = []
        tds = soup.select(".black3")
        if tds is not None:
            for td in tds:
                href = td.a['href']
                if href is not None:
                    links.append(domain + td.a['href'])

        for j, link in enumerate(links):
            print('page', i, 'article', j)

            title, author, content = page.parser(link)

            if (title is not None) and (author is not None) and (content is not None):
                case_num += 1

                paper = {'title': title, 'author': author, 'content': content}
                # papers.append(paper)
                path = output_file + '.json'
                with open(path, 'a', encoding='utf-8') as file:
                    file.write(json.dumps(paper, ensure_ascii=False) + '\n')

                # path = "../resource/" + output_file + "/" + str(case_num) + ".txt"
                # with open(path, 'w', encoding='utf-8') as file:
                #     file.write(paper['title'])
                #     file.write('\n')
                #     file.write(paper['author'])
                #     file.write('\n')
                #     file.write(paper['content'])
                #     file.write('\n')
                #     file.close()

        # the last page has no "next page" link
        if i < max_page - 1:
            # follow the "下一页" (next page) link to get the next listing URL
            next_a = soup.find('a', text='下一页')
            url = domain + next_a['href']

    print('case number', case_num)
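
A minimal driver showing how get_link_by_page might be invoked. The listing URL, page count, and output name below are placeholders, and domain is assumed to be the module-level constant holding the site root.

# Hypothetical usage; the values are placeholders, not taken from the original project.
if __name__ == '__main__':
    listing_url = domain + '/list.html'
    get_link_by_page(listing_url, max_page=10, output_file='articles')  # appends to articles.json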
Example #3
def page(url):
    try:
        content = html.get_html_content(url)

        # Concatenate the text of every <p> inside <div class="article">,
        # one paragraph per line.
        output = ''
        div = content.find('div', 'article')
        ps = div.find_all(['p'])

        for p in ps:
            for text in p.stripped_strings:
                output += text
            output += '\n'
    except Exception:
        output = None

    return output
Example #4
def get_home_article(home_links, all_home_url, all_article_url, path, i):
    # Crawl each home page that has not been visited yet, then fetch and save
    # every article linked from it that has not been saved before.
    for home_link in home_links:
        if home_link in all_home_url:
            continue
        home_page = html.get_html_content(home_link)
        article_links = get_article_links(home_page) + get_hot_links(home_page)
        for article_link in article_links:
            if article_link in all_article_url:
                continue
            all_article_url.append(article_link)
            article = get_article(article_link)
            result(article, path, str(i))
            print(i)
            i += 1
    return i
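
The result helper called above is another project function whose body is not shown; its signature is only known from the call result(article, path, str(i)). A plausible sketch follows, assuming article is the extracted text and the helper writes one numbered .txt file per article under path.

# Hypothetical sketch of result(article, path, name); the file layout is an
# assumption, not the project's actual behavior.
import os


def result(article, path, name):
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.txt'), 'w', encoding='utf-8') as file:
        file.write(article)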
Example #5
def parser(article_url):
    # Parse one article page and return (title, author, content); each is None
    # if extraction fails. The pages are encoded in gb18030.
    soup = html.get_html_content(article_url, page_encoding='gb18030')
    try:
        if soup is None:
            return None, None, None

        if soup.find('html') is None:
            print('not html page', article_url)
            return None, None, None

        title = soup.find('td', 'lblue3')
        if title is not None and title.string is not None:
            title = title.string.strip()
        else:
            title = None

        author = soup.find('td', 'grey4')
        if author is not None and author.string is not None:
            author = author.string.strip()
            # Drop the "浏览 ..." (view count) segment that sits between
            # "浏览" and "时间" (time) in the author line.
            ai1 = author.find('浏览')
            ai2 = author.find('时间')
            if ai1 != -1 and ai2 != -1:
                author = author[:ai1] + author[ai2:]
        else:
            author = None

        # Article body: join the text of every <p>/<div> under <td class="black3">.
        content = ''
        content_element = soup.find('td', 'black3')
        if content_element is not None:
            content_element = content_element.find_all(['p', 'div'])
            if content_element:
                for element in content_element:
                    for text in element.stripped_strings:
                        content += text
                    content += '\n'

        if title is None or author is None or not content:
            print('extract empty', article_url)

        # print(title)
        # print(author)
        # print(content)
    except Exception:
        title = None
        author = None
        content = None

    return title, author, content
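
Example #2 consumes this function as page.parser(link). A minimal usage sketch follows; the article URL is a placeholder, and domain is again assumed to be the site-root constant.

# Hypothetical usage; the URL is a placeholder.
title, author, content = parser(domain + '/article/1.html')
if title is not None and content is not None:
    print(title)
    print(content[:200])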