import datetime
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm

# Project-local helpers assumed importable in this package: QueueManager,
# isVisited, database, Webpage, load_html, filtered_addtolist,
# space_to_dash, stoplist and MAX_ITERATION are defined elsewhere in the
# crawler.


def parse_an_article(link_queue=None, link_visited=None):
    """Scrape one URL from the queue and enqueue its outgoing links;
    returns the updated (queue, visited) crawl state."""
    if link_queue is None:
        link_queue = QueueManager.list_init()
    if link_visited is None:
        link_visited = isVisited.init()
    target_url = link_queue.get()
    if target_url in link_visited or target_url in stoplist:
        return link_queue, link_visited
    print(target_url)
    html = load_html(target_url)
    link_visited[target_url] = True

    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        return link_queue, link_visited

    # Extract the publication date, trying three page layouts in turn:
    # a millisecond timestamp attribute, an ISO-8601 meta value, then a
    # plain "Y-m-d H:M:S" string.
    date = None
    try:
        # 'data-val' is in milliseconds since the epoch; scale to seconds.
        time_stamp = int(soup.find(id='news-time')['data-val']) / 1000
        date = datetime.datetime.fromtimestamp(time_stamp)
    except (TypeError, KeyError, ValueError):
        try:
            raw_date = soup.find(id='pubtime_baidu')['content']
            date = datetime.datetime.strptime(raw_date,
                                              "%Y-%m-%dT%H:%M:%S+08:00")
        except (TypeError, KeyError, ValueError):
            try:
                raw_date = soup.find(id='pubtime_baidu').string
                date = datetime.datetime.strptime(raw_date,
                                                  "%Y-%m-%d %H:%M:%S")
            except (AttributeError, TypeError, ValueError):
                pass

    # <title> may be absent on malformed pages, so guard before reading it.
    byte_title = soup.title.string if soup.title else None

    # Extract the body text: prefer an <article> tag, otherwise fall back
    # to the element marked itemprop="articleBody".
    byte_content = ""
    article = soup.find('article')
    if article is not None:
        byte_content = ''.join(article.strings)
    else:
        article = soup.find(itemprop='articleBody')
        if article is not None:
            for tag in article.descendants:
                if tag.name == 'p':
                    text = tag.string
                    if text is None and tag.br is not None:
                        text = tag.br.string
                    if text:
                        byte_content += text
                    byte_content += '\n'
    # parsing finished

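    # Feed every outgoing link through the filter and onto the crawl queue.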
    for raw_link in soup.find_all('a'):
        filtered_addtolist(raw_link.get('href'), link_queue, link_visited)

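    # Bundle the scraped fields; note that only the body text is persisted
    # below, keyed by the publication date.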
    page = Webpage(target_url, byte_title, date, byte_content)
    date_filename = space_to_dash(str(date))
    database.save(date_filename, byte_content)
    print(date_filename)
    return link_queue, link_visited


if __name__ == '__main__':
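    # Crawl in batches of MAX_ITERATION pages, checkpointing the queue and
    # visited set after each batch so the crawl can be resumed later.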
    while True:
        link_queue = QueueManager.list_init()
        link_visited = isVisited.init()
        for _ in tqdm(range(MAX_ITERATION)):
            link_queue, link_visited = \
                parse_an_article(link_queue, link_visited)
        print("now start saving")
        QueueManager.list_save(link_queue)
        isVisited.save(link_visited)
        print("saving complete")
        while True:
            user_input = input("continue?\n(y/n)")
            if user_input == 'y':
                break
            if user_input == 'n':
                sys.exit(0)