import urllib.parse
import urllib.robotparser as robotparser

# Dual import paths: works when run from inside the package directory
# (flat imports) and when imported as part of the `crawler` package.
try:
    from base import logger_setup
    from settings import CRAWL_DELAY
    import sitemap
except ImportError:
    from crawler.base import logger_setup
    from crawler.settings import CRAWL_DELAY
    import crawler.sitemap as sitemap

# Module-level logger configured by the project's own helper.
logger = logger_setup(__name__)


class Txt(robotparser.RobotFileParser):
    """
    Extension of robotparser, adds sitemap functionality, mainly a copy.

    Additions:
    - sitemaps
    - logging
    """

    def __init__(self, url, base_url):
        # NOTE(review): no super().__init__(url) call is visible in this
        # chunk; the class body presumably continues beyond it -- confirm
        # against the full file that the base parser is initialized.
        self.base_url = base_url        # root URL of the site this robots.txt belongs to
        self.sitemap = ()               # sitemap URLs; empty tuple until populated (presumably during parsing -- verify)
        self.crawl_delay = CRAWL_DELAY  # project-wide default delay; likely overridden by a Crawl-delay directive -- verify
from settings import * import validate import webpage from webpage import remove_file except ImportError: import crawler.base as base_ from crawler.filequeue import Empty import crawler.model as model import crawler.robot as robot from crawler.settings import * import crawler.validate as validate import crawler.webpage as webpage from crawler.webpage import remove_file logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logger = base_.logger_setup(__name__) logger.debug("NEW_CRAWL_RUN | " + dt.now().strftime('%H:%M | %d-%m-%Y |')) ENCODINGS = ['utf_8', 'latin_1', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_7', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp1026', 'cp1125', 'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869', 'cp932', 'cp949', 'cp950', 'euc_jis_2004', 'euc_jisx0213', 'euc_jp', 'euc_kr', 'gb18030', 'gb2312', 'gbk', 'hex_codec', 'hp_roman8', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',