import urllib2
from bs4 import BeautifulSoup
import re
import time
import urlparse

from initialremove import remove_files
from errorlogging import logerror
import path_define

# Global variables
LINK_ANCHOR = {}
LINK_FILENAME = []
ERROR_FILE_PATH = path_define.get_ERROR_FILE_PATH()
CRAWLEDLISTPATH = path_define.get_CRAWLEDLISTPATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()


def get_html_content(url):
    try:
        counter = 1
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        content.prettify()
        # Derive a file name from the Wikipedia article id in the URL,
        # stripping hyphens and underscores.
        article_id = url.split("/wiki/")[1]
        file_name = re.sub(r'[-_]', '', article_id)
        if file_name not in LINK_FILENAME:
            LINK_FILENAME.append(file_name)
        else:
            # Disambiguate duplicate names with an increasing numeric
            # suffix (name1, name2, ...) rather than compounding
            # suffixes onto the already-suffixed name.
            base_name = file_name
            while file_name in LINK_FILENAME:
                file_name = base_name + str(counter)
                counter += 1
            LINK_FILENAME.append(file_name)
    except Exception as e:
        # Assumed error path: log the failing URL via the imported
        # helper (logerror's exact signature is not shown in the source).
        logerror(url + ": " + str(e))
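# A minimal driver sketch -- not part of the original source. It assumes
# get_html_content is invoked once per article URL and that remove_files
# clears the previous run's output first; the Wikipedia URL below is
# purely illustrative.
if __name__ == "__main__":
    remove_files()  # delete HTML files and lists left over from a prior run
    get_html_content("https://en.wikipedia.org/wiki/Web_crawler")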
# initialremove.py (imported above as initialremove)
import os
from os.path import exists
from glob import glob

import path_define

# os.path.join avoids the fragile literal backslash in the original
# string concatenation ("\crawled_list.txt").
CRAWLEDLISTPATH = os.path.join(path_define.get_CRAWLEDLISTPATH(), "crawled_list.txt")
ERROR_FILE_PATH = path_define.get_ERROR_FILE_PATH()
CRAWLED_HTML_PATH = path_define.get_CRAWLED_HTML_PATH()


def remove_files():
    # Delete every crawled HTML page, then the crawled-list and error
    # files, if they exist.
    for filename in glob(os.path.join(CRAWLED_HTML_PATH, '*.html')):
        os.remove(filename)
    if exists(CRAWLEDLISTPATH):
        os.remove(CRAWLEDLISTPATH)
    if exists(ERROR_FILE_PATH):
        os.remove(ERROR_FILE_PATH)
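# Both modules rely on path_define for the three location getters. A
# minimal sketch of that module, assuming everything lives beside the
# sources; the actual paths are project-specific and not shown above.
# path_define.py
import os

_BASE = os.path.dirname(os.path.abspath(__file__))

def get_ERROR_FILE_PATH():
    # Assumed: the error log is a single file under the project root.
    return os.path.join(_BASE, "error_log.txt")

def get_CRAWLEDLISTPATH():
    # Assumed: the directory that holds crawled_list.txt.
    return os.path.join(_BASE, "crawled")

def get_CRAWLED_HTML_PATH():
    # Assumed: the directory where fetched .html pages are written.
    return os.path.join(_BASE, "html")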