def parse_an_article(link_queue=None, link_visited=None):
    """Crawl one URL from the queue: extract title, date and body text,
    persist the article, and enqueue its outgoing links.

    Parameters
    ----------
    link_queue : crawl queue (``.get()`` yields the next URL). Defaults to a
        fresh ``QueueManager.list_init()`` built per call — the original code
        evaluated this default once at definition time, so every
        default-argument call shared and mutated the same queue.
    link_visited : mapping of URL -> True for pages already crawled.
        Defaults to a fresh ``isVisited.init()`` per call, for the same reason.

    Returns
    -------
    tuple
        ``(link_queue, link_visited)`` so the caller can thread crawl state
        through repeated calls.
    """
    # None-sentinel idiom: build fresh defaults per call, not at def time.
    if link_queue is None:
        link_queue = QueueManager.list_init()
    if link_visited is None:
        link_visited = isVisited.init()

    target_url = link_queue.get()
    if target_url in link_visited or target_url in stoplist:
        return link_queue, link_visited

    print(target_url)
    html = load_html(target_url)
    link_visited[target_url] = True
    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        # Unparseable page: skip it and keep crawling.
        return link_queue, link_visited

    # --- publication date: try three site layouts in order of preference ---
    date = None
    try:
        # 'data-val' holds a millisecond epoch timestamp; scale to seconds.
        time_stamp = int(soup.find(id='news-time')['data-val']) / 1000
        date = datetime.datetime.fromtimestamp(time_stamp)
    except Exception:
        try:
            raw_date = soup.find(id='pubtime_baidu')['content']
            date = datetime.datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S+08:00")
        except Exception:
            try:
                raw_date = soup.find(id='pubtime_baidu').string
                date = datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
            except Exception:
                pass  # date stays None when no layout matches

    byte_title = soup.title.string

    # --- article body ---
    byte_content = ""
    try:
        # Primary layout: all text nodes under <article>.
        # join() replaces the original quadratic `+=` accumulation.
        article = soup.find('article')
        byte_content = ''.join(article.strings)
    except Exception:
        # Fallback layout: schema.org itemprop="articleBody", text in <p> tags.
        article = soup.find(itemprop='articleBody')
        try:
            for tag in article.descendants:
                if tag.name == 'p':
                    # NOTE: `+=` with a None .string raises TypeError, which is
                    # what routes us to the <br> fallback — keep this shape.
                    try:
                        byte_content += tag.string
                    except Exception:
                        try:
                            byte_content += tag.br.string
                        except Exception:
                            pass
                    byte_content += '\n'
        except Exception:
            pass

    # --- enqueue every link this page points at ---
    for raw_link in soup.find_all('a'):
        filtered_addtolist(raw_link.get('href'), link_queue, link_visited)

    # NOTE(review): `page` is never read here — kept in case Webpage's
    # constructor has side effects; confirm and drop if it does not.
    page = Webpage(target_url, byte_title, date, byte_content)
    date_filename = space_to_dash(str(date))
    database.save(date_filename, byte_content)
    print(date_filename)
    return link_queue, link_visited
if __name__ == '__main__':
    # Crawl in rounds: load persisted state, process MAX_ITERATION pages,
    # save state, then ask the operator whether to run another round.
    while True:
        link_queue = QueueManager.list_init()
        link_visited = isVisited.init()
        for _round in tqdm(range(MAX_ITERATION)):
            link_queue, link_visited = parse_an_article(link_queue, link_visited)
        print("now start saving")
        QueueManager.list_save(link_queue)
        isVisited.save(link_visited)
        print("saving complete")
        # Re-prompt until the operator answers 'y' (next round) or 'n' (quit).
        answer = None
        while answer not in ('y', 'n'):
            answer = input("continue?\n(y/n)")
        if answer == 'n':
            exit(0)