import json
import os
from http.server import BaseHTTPRequestHandler

# Cache and shopping_process are assumed to be provided elsewhere in the
# project; they are not defined in this file.


class Handler(BaseHTTPRequestHandler):
    '''Handles HTTP requests, responding to GET requests with the relevant
    page, or 404, and to POST requests with application data.'''

    ROOT = '../html'

    def __init__(self, *args, **kwargs):
        # Note: http.server constructs a new Handler for every request, so
        # the cache and app table are rebuilt each time.
        self.cache = Cache()
        self.cache.add_file('../html/index.html')
        self.cache.add_alias('../html/', '../html/index.html')
        self.apps = dict()
        self.apps['/shopping_list'] = shopping_process
        super().__init__(*args, **kwargs)

    def do_GET(self):
        '''
        Respond to a GET request.

        A GET request should be asking for a page or other document on the
        server, i.e. a file that exists. If it does, send it back; otherwise,
        send a "404: file not found" page.
        '''
        requested_file = self.cache[os.path.join(self.ROOT, self.path[1:])]
        if requested_file is None:
            self.file_not_found()
        else:
            self.serve_file(requested_file)

    def do_POST(self):
        ''' Handle a POST request by dispatching it to the matching app. '''
        content_length = int(self.headers['Content-length'])
        data = json.loads(self.rfile.read(content_length).decode('utf-8'))
        if self.path in self.apps:
            self.respond(200, 'application/json', self.apps[self.path](data))
        else:
            self.file_not_found()

    def file_not_found(self):
        ''' Send a 404 page to the client when the requested file is not found. '''
        self.respond(
            404,  # was 400; the docstring and the page body both describe a 404
            'text/html',
            '\n'.join([
                '<html>',
                '  <body>',
                '    <h1 style="text-align: center;">404</h1>',
                '  </body>',
                '</html>'
            ]))

    def serve_file(self, cached_file):
        ''' Convenience method to send a cached file to the client. '''
        self.respond(200, cached_file.get_mimetype(), cached_file.get_contents())

    def respond(self, code, mime, data):
        ''' Send a response to the client. '''
        self.send_response(code)
        self.send_header('Content-type', mime)
        self.end_headers()  # terminate the header block before writing the body
        if isinstance(data, str):
            data = data.encode('utf-8')
        self.wfile.write(data)
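# Usage sketch (an assumption, not part of the original source): serving
# Handler with the standard library's HTTPServer. run_server, the address and
# the port are illustrative; Cache and shopping_process must be importable
# for the handler to construct itself.
def run_server(port=8000):
    ''' Spin up a blocking HTTP server around Handler on the given port. '''
    from http.server import HTTPServer
    server = HTTPServer(('localhost', port), Handler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        server.server_close()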
import urllib.request
from time import sleep
from urllib import robotparser
from urllib.error import ContentTooShortError, HTTPError, URLError

# Logger, Config, Decryptor and Cache are assumed to be project classes
# defined elsewhere; they are not part of this file.


class WikiCrack(object):

    url = 'https://www.wikipedia.org/wiki/'

    def __init__(self):
        self.logger = Logger('.\\logs\\')
        self.CONF = Config('default.yaml', self.logger).get()['wikicrack']
        self.no_attempts = self.CONF['crawler']['max-attempts-download']
        self.start_agent = self.CONF['crawler']['agent-name']
        self.sleep_for = self.CONF['crawler']['sleep-between']
        self.max_accepted = self.CONF['cache']['limit-per-subject']
        self.accepted_length = self.CONF['output']['min-accepted-length']
        self.word_wrap = self.CONF['output']['word-wrap']
        self.decrypt = Decryptor(self.logger)
        self.cache = Cache(self.CONF, self.logger)
        self.agent = None

    def get_valid_user_agent(self):
        # Initialise the robots.txt parser. robots.txt lives at the site
        # root, not under /wiki/ (the original appended it to self.url).
        parser = robotparser.RobotFileParser()
        parser.set_url('https://www.wikipedia.org/robots.txt')
        parser.read()
        # Try to find an agent name robots.txt will accept, in fewer than 10
        # attempts, by appending or incrementing a trailing digit.
        user_agent = self.start_agent
        no_hops = 0
        while not parser.can_fetch(user_agent, self.url):
            if user_agent[-1].isdigit():
                user_agent = user_agent[:-1] + str(int(user_agent[-1]) + 1)
            else:
                user_agent = user_agent + '1'
            no_hops += 1
            # Error in finding a valid name; fall back to a default.
            if no_hops > 9:
                return 'default-agent'
        return user_agent

    def __download_page(self, url, user_agent):
        self.logger.log(self.__download_page, __file__,
                        'Downloading: ' + url + ' ...')
        page = None
        req = urllib.request.Request(url)
        req.add_header('User-agent', user_agent)
        tries = 0
        while tries < self.no_attempts:
            try:
                response = urllib.request.urlopen(req)
                page = response.read().decode('utf-8')
                break
            except (URLError, HTTPError, ContentTooShortError) as e:
                # Retry only on 5xx server errors; any other failure is final.
                if hasattr(e, 'code'):
                    if not (e.code >= 500 and e.code < 600):
                        return None
                sleep(self.sleep_for)
                tries += 1
        return page

    def search_for(self, term):
        self.logger.log(self.search_for, __file__,
                        'Searching for subject: {}...'.format(term))
        hits = self.cache.get_file(term)
        if hits == [] or len(hits) > self.max_accepted:
            # There are no cache hits at all.
            if not hits:
                self.logger.log(self.search_for, __file__, 'Cache miss!')
            # The search term is too general for our cache.
            if len(hits) > self.max_accepted:
                self.logger.log(self.search_for, __file__,
                                'Too many cache hits! Treating it as a wrong result.')
            # Only the first keyword is used to build the article URL.
            keywords = term.split(' ')
            if not self.agent:
                self.agent = self.get_valid_user_agent()
            content = self.__download_page(self.url + keywords[0], self.agent)
            # Prepare the decryptor and get clean text from it.
            self.decrypt.set_content(content)
            result = self.decrypt.get_text(wrap=self.word_wrap)
            # Check whether the text's length is reasonable.
            if len(result) >= self.accepted_length:
                # Add an entry to the cache.
                self.cache.add_file(term, result)
                return result
            else:
                # The content is too short: there is a good chance we landed
                # on a 'may also refer to' disambiguation page.
                self.logger.log(self.search_for, __file__,
                                'Content too short! Considering the search operation a failure.')
                return ""
        else:
            self.logger.log(self.search_for, __file__,
                            'Cache hit! Extracting from cache...')
            with open(hits[0], 'rt') as file:
                return file.read()
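# Usage sketch (an assumption, not part of the original source): a one-shot
# lookup. demo_search and the subject are illustrative; note that
# search_for() builds the URL from the first keyword only, so a single-word
# subject behaves most predictably.
def demo_search(subject='Python'):
    ''' Run a single WikiCrack search and print the outcome. '''
    crawler = WikiCrack()
    text = crawler.search_for(subject)
    if text:
        print(text)
    else:
        print('Search failed (page missing or disambiguation stub).')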