def request_handler(self, connection, address):
    data = connection.recv(BUFFER_SIZE)
    self.logger.info("\n%s", data)
    request = Request.create_request(data)
    response = str(RequestHandler(request, self.document_root).response)
    connection.sendall(response)
    connection.close()

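# --- Hypothetical wiring sketch (not part of the original handler) ---
# The handler above expects an already-accepted socket connection; a minimal
# accept loop that dispatches each connection to it might look like this.
# HOST, PORT, the backlog and the one-thread-per-connection strategy are
# assumptions chosen for illustration. Note that on Python 3, sendall()
# requires bytes, so the handler's response string would need to be encoded
# before sending.
import socket
import threading

HOST, PORT = '127.0.0.1', 8080


def serve_forever(server):
    """Accept connections and hand each one to server.request_handler."""
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind((HOST, PORT))
    listener.listen(5)
    while True:
        connection, address = listener.accept()
        threading.Thread(target=server.request_handler,
                         args=(connection, address), daemon=True).start()
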
def __init__(self):
    # Initialize the spider list
    self.spiders = self.__init_spiders()
    # Initialize the output manager
    self.op_manager = OutPutManager(self.spiders)
    # Initialize the request handler
    self.request_handler = RequestHandler(self.spiders)
    # Initialize the job list
    self.jobs = []

def __init__(self, dropbox, logall, mail, maxsize):
    self.dropbox = dropbox
    self.logall = logall
    self.sendmail = mail
    self.maxsize = maxsize
    if self.dropbox:
        assert secrets.PATH_IN_DB != ''
        self.save_path = secrets.PATH_IN_DB
        self.file_handler = DropboxSaver(self.save_path, secrets.DROPBOX_TOKEN)
    else:
        assert secrets.PATH != ''
        self.save_path = secrets.PATH
        self.file_handler = FileSaver(self.save_path)
    self.req = RequestHandler(secrets.USER, secrets.PASSWORD)
    self.file_handler.create_folder(CHLOG_FOLDER)
    self.database = Database(self.file_handler, self.dropbox)
    self.courses = secrets.COURSES
    self.removed_label_flag = False
    self.downloads = []
    self.changelog = []

class Crawler:
    """A crawler for downloading university e-learning content."""

    def __init__(self, dropbox, logall, mail, maxsize):
        self.dropbox = dropbox
        self.logall = logall
        self.sendmail = mail
        self.maxsize = maxsize
        if self.dropbox:
            assert secrets.PATH_IN_DB != ''
            self.save_path = secrets.PATH_IN_DB
            self.file_handler = DropboxSaver(self.save_path, secrets.DROPBOX_TOKEN)
        else:
            assert secrets.PATH != ''
            self.save_path = secrets.PATH
            self.file_handler = FileSaver(self.save_path)
        self.req = RequestHandler(secrets.USER, secrets.PASSWORD)
        self.file_handler.create_folder(CHLOG_FOLDER)
        self.database = Database(self.file_handler, self.dropbox)
        self.courses = secrets.COURSES
        self.removed_label_flag = False
        self.downloads = []
        self.changelog = []

    def __str__(self):
        if not self.downloads:
            return 'Files were already up to date.'
        else:
            s = 's' if len(self.downloads) != 1 else ''
            d = 'DROPBOX/' if self.dropbox else ''
            p = d + self.save_path
            restr = '{} new file{} downloaded from ILIAS to {}.'.format(str(len(self.downloads)), s, p)
            return restr

    def run(self):
        """Main entry point.

        Authenticate the client, crawl the courses, persist the results,
        write a changelog and optionally send a mail with the results.
        """
        # authentication
        try:
            response = self.req.login()
            html_text = response.text
        except ConnectionError as err:
            print(err, 'A ConnectionError occurred. Please check your internet connection.', sep='\n')
            sys.exit(1)
        # check whether authentication worked; has to be done this way,
        # since the HTTP response on failed authentication is 200 - OK.
        auth_failed_msg = 'Anmeldedaten wurden nicht akzeptiert'
        if auth_failed_msg in html_text:
            print('Authentication failed. Please check your username and password.')
            sys.exit(1)
        # crawl courses
        self.crawl(html_text)
        # wrap up: close database, write changelog and send mail
        self.database.close(self.file_handler, self.dropbox)
        self.write_changelog()
        if self.sendmail and self.downloads:
            self.req.send_mail(self, self.downloads)
        # print download stats
        clrone = clr.BOLD
        clrtwo = clr.GREEN if self.downloads else clr.ENDC
        clrend = clr.ENDC
        print(clrone, clrtwo, self, clrend, sep='')

    def crawl(self, html_text):
        """Loop through the top-level courses and crawl the content of every course."""
        soup_courses = BeautifulSoup(html_text, 'html.parser')
        for soup_course in soup_courses.findAll('a', {'class': 'il_ContainerItemTitle'}):
            scs = soup_course.string
            course_name = util.course_contains(scs, self.courses)
            relative_link = soup_course.get('href')
            course_url = 'https://ilias.uni-mannheim.de/' + relative_link
            if course_name is not None:
                self.crawl_course(course_url, course_name + '/')
            else:
                print(clr.BOLD, 'No download requested for course >> ', clr.ENDC, scs.lstrip(), sep='')

    def crawl_course(self, course_url, folder_path):
        """Recursively crawl a course page, descending into subfolders until there
        is something to download for this course in the respective path."""
        html_text_course = self.req.session.get(course_url).text
        soup_course = BeautifulSoup(html_text_course, 'html.parser')
        containers = soup_course.find_all('div', {'class': 'il_ContainerListItem'})
        if containers:
            if not self.removed_label_flag:
                util.print_method('folder_path', folder_path)
            for container in containers:
                file_ending = ''
                soup_line = container.find('a', {'class': 'il_ContainerItemTitle'})
                if soup_line:
                    link = soup_line.get('href')
                else:
                    continue
                item_properties = container.find('div', {'class': 'ilListItemSection il_ItemProperties'})
                if item_properties is not None:
                    item_prop = item_properties.find_all('span', {'class': 'il_ItemProperty'})
                    properties = [str(prop.string.strip()) for prop in item_prop if prop.string is not None]
                    if properties:
                        file_ending = properties[0]
                        last_update = properties[2]
                        # 22. May 2019, 14:15 -> 2019-05-22 14:15:00
                        d = datetime.strptime(last_update, '%d. %b %Y, %H:%M')
                        # 201905221415
                        last_update = d.strftime('%Y%m%d%H%M')
                if 'download' in link:
                    self.file_handler.create_folder(folder_path)
                    self.check_save(folder_path, soup_line.string, file_ending, last_update, link)
                else:
                    parsed = util.remove_edge_characters(soup_line.string)
                    if not parsed:
                        self.removed_label_flag = True
                    self.crawl_course('https://ilias.uni-mannheim.de/' + link, folder_path + parsed)
        else:
            util.print_method('no_files_in', str(folder_path))

    def check_save(self, folder_path, filename, file_ending, last_update, url):
        """Prepare the file to be saved. Remove edge characters, trim and add
        the correct file ending."""
        # remove edge characters and trim
        filename = re.sub(r'[&]', 'and', filename)
        filename = re.sub(r'[!@#$/\:;*?<>|]', '', filename).strip()
        http = self.req.session.head(url, headers={'Accept-Encoding': 'identity'})
        file_size = http.headers['content-length']
        if not file_ending:
            file_ending = str(mimetypes.guess_extension(http.headers['content-type']))
        relative_file = folder_path + filename
        relative_path = relative_file + '.' + file_ending
        # for printing what is done with that file
        clrone = clr.ENDC
        clrtwo = clr.ENDC
        method = ''
        messag = relative_path
        # query db for path and update
        res_pu = self.database.get_name_update(relative_path, last_update)
        # example file sizes: 2e8 = 200,000,000 bytes (200 MB); 5e7 = 50 MB
        if float(file_size) >= self.maxsize:
            # skip
            clrone = clr.BLUE
            method = 'file_skipped'
        # if the db contains an entry with this path and update, the file was already downloaded
        elif res_pu:
            # exists
            method = 'loaded_once'
        else:
            # download file to compute hash
            content = self.req.session.get(url).content
            # compute content hash
            content_hash = hashlib.sha1(content).hexdigest()
            # query db for hash
            res_h = self.database.get_hash(content_hash)
            # filename or last update may have changed but the hash exists,
            # thus the file is known and was already downloaded
            if res_h:
                # exists
                method = 'loaded_once'
            else:
                # query db for name
                res_p = self.database.get_name(relative_path)
                # if this name already exists in the database it must be an update,
                # because otherwise the name + last_update or the hash would already
                # have been in the db
                if res_p:
                    method = 'file_update'
                    clrone = clr.GREEN
                    relative_path = '{}_UP{}.{}'.format(relative_file, content_hash[:4], file_ending)
                    messag = relative_path
                # not an update: new file
                else:
                    # check whether this filename already exists at the destination path;
                    # should not happen unless the user renamed a file to exactly this
                    # downloaded file name. The check only exists to inform the user that
                    # the file is not simply overwritten but safely moved to the
                    # .overwritten/ folder.
                    exists = self.file_handler.exists(relative_path)
                    if not exists:
                        method = 'downloading'
                        clrone = clr.BOLD
                    else:
                        method = 'safe_overwr'
                        clrone = clr.RED
                        messag = relative_path + ' from ' + url
                saved = self.file_handler.save_file(relative_path, content)
                if saved:
                    self.database.insert(relative_path, content_hash, last_update)
                    self.downloads.append(method + ': ' + relative_path)
        if method not in ('file_skipped', 'loaded_once') or self.logall:
            self.changelog.append(str(method + ': ' + messag))
        util.print_method(method, messag, clrone, clrtwo)

    def write_changelog(self):
        """Write a changelog to /chosen_dir/.changelog/changelog_{datetime}."""
        if not self.changelog:
            return
        d = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
        tmp = '# Changelog from {}\n'.format(d)
        tmp += len(tmp) * '-' + '\n'
        tmp += '\n'.join(self.changelog) + '\n'
        b = tmp.encode('utf-8')
        self.file_handler.save_file(CHLOG_FOLDER + 'changelog_{}.txt'.format(d), b, True)

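# --- Hypothetical usage sketch (not part of the original project) ---
# The Crawler above is driven entirely through its constructor flags and run().
# A command-line entry point might look like this; the flag names and the 50 MB
# default for --maxsize are assumptions chosen for illustration.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download new ILIAS course files.')
    parser.add_argument('--dropbox', action='store_true',
                        help='save files to Dropbox instead of the local path')
    parser.add_argument('--logall', action='store_true',
                        help='also list skipped and already-known files in the changelog')
    parser.add_argument('--mail', action='store_true',
                        help='send a mail with the download summary')
    parser.add_argument('--maxsize', type=float, default=5e7,
                        help='skip files larger than this many bytes')
    args = parser.parse_args()
    Crawler(args.dropbox, args.logall, args.mail, args.maxsize).run()
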
class Core(object):

    def __init__(self):
        # Initialize the spider list
        self.spiders = self.__init_spiders()
        # Initialize the output manager
        self.op_manager = OutPutManager(self.spiders)
        # Initialize the request handler
        self.request_handler = RequestHandler(self.spiders)
        # Initialize the job list
        self.jobs = []

    def __call__(self):
        """Start the coroutines."""
        if getattr(setting, 'INIT_REQUESTS', getattr(defaultsettings, 'INIT_REQUESTS')):
            self.__init_requests()
        concurrency = getattr(setting, 'CONCURRENCY', getattr(defaultsettings, 'CONCURRENCY'))
        self.jobs = [gevent.spawn(self.__roll_request) for i in range(concurrency)]
        gevent.joinall(self.jobs)

    def __init_requests(self):
        for spider in self.spiders:
            for request in spider.start_requests():
                RequestQueue.push(request, spider)

    @staticmethod
    def __init_spiders():
        """Instantiate the spiders specified in the settings."""
        def is_module(x):
            if x.endswith('.py') and not x.startswith('__'):
                return True

        spiders = []
        for module_file in filter(is_module, os.listdir('spiders')):
            module = importlib.import_module('spiders.' + '.'.join(module_file.split('.')[:-1]))
            for attr_name in dir(module):
                attr = getattr(module, attr_name)
                if type(attr) == type and issubclass(attr, Spider) and attr.__name__ in tools.get_conf('spiders'):
                    logger.debug('Create instances of spider<%s>' % attr.__name__)
                    spiders.append(attr())
        return spiders

    def __save_mode(self, request, err):
        """Handle unexpected exceptions: keep the process alive in safe mode,
        exit the process in debug mode."""
        logger.error('Unexpected error happen when crawling<%s>, reason: %s: %s', request.url, type(err), err)
        if not tools.get_conf('SAFE_MODE'):
            gevent.killall(self.jobs)

    def __throw_request(self, request, spider):
        """Hand the request to the request handler (RequestHandler).
        NoResponseError is raised when no response could be obtained; for
        unknown errors, SAFE_MODE decides whether the process exits."""
        try:
            return self.request_handler.handle_request(request, spider)
        except NoResponseError:
            raise
        except Exception as err:
            self.__save_mode(request, err)
            time.sleep(60)

    def __parse_response(self, response, spider):
        """Parse the response: push new requests onto the request queue and
        hand extracted items to the output manager."""
        try:
            callback = getattr(spider, response.request.callback)
            for each in callback(response):
                if isinstance(each, req.Request):
                    # Push newly generated requests onto the queue
                    RequestQueue.push(each, spider)
                elif isinstance(each, dict):
                    # Hand the extracted item to op_manager for processing
                    self.op_manager(each, spider)
        except Exception as err:
            self.__save_mode(response.request, err)

    def __roll_request(self):
        """Keep pulling requests from the request queue and processing them
        (send the request, parse the response)."""
        while 1:
            try:
                request = RequestQueue.pop()
            except EmptyError:
                # The request queue is empty; exit the program
                gevent.killall(self.jobs)
            try:
                # Dispatch the request
                response = self.__throw_request(request, request.spider)
            except NoResponseError as err:
                logger.debug('No response, reason: %s', err)
                if 'duplicate url' in err.message:
                    RequestQueue.del_requesting(request)
            else:
                # Parse the response
                self.__parse_response(response, request.spider)
                RequestQueue.del_requesting(request)

    def start_core(self):
        """Start the core driver."""
        # Increase the redis connection count
        redis_conn.setnx('connect_clients', 0)
        redis_conn.incr('connect_clients')
        self.init_start_requests()
        self.jobs = [gevent.spawn(self.roll_request) for i in range(getattr(setting, 'CONCURRENCY', 10))]
        gevent.joinall(self.jobs)

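# --- Hypothetical spider sketch (not part of the original project) ---
# Core.__init_spiders() imports every module in the spiders/ package and
# instantiates each Spider subclass whose name is listed in the configuration.
# Core then calls start_requests() on each spider and, for every response, the
# callback named on the request, which may yield dicts (items handed to
# OutPutManager) or new requests. The base-class location, the req.Request
# signature and the 'parse' callback name below are assumptions inferred from
# how Core uses them, not a documented API.
#
# spiders/example.py
from spider import Spider        # assumed module providing the Spider base class
import request as req            # assumed module providing req.Request


class ExampleSpider(Spider):

    def start_requests(self):
        # Every yielded request is pushed onto the RequestQueue by Core.
        yield req.Request(url='https://example.com/list', callback='parse')

    def parse(self, response):
        # Dicts are treated as items; requests are queued for further crawling.
        yield {'url': response.request.url, 'size': len(response.text)}
        yield req.Request(url='https://example.com/page/2', callback='parse')
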
def __init__(self, config):
    """Takes a configReader.Config object as an argument."""
    self.request = RequestHandler(config)
    self.config = config

class API:
    def __init__(self, config):
        """Takes a configReader.Config object as an argument."""
        self.request = RequestHandler(config)
        self.config = config

    def getItemsById(self, ids=[]):
        """Returns an item collection. Takes a list of ids as an argument."""
        ids = [str(x) for x in ids]
        resp = self.request.get({"action": "wbgetentities", "ids": "|".join(ids)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemById(self, iid):
        """Returns the item defined by the argument."""
        return self.getItemsById([iid])[0]

    def getItemsByInterwiki(self, arg1=[], arg2=[]):
        """Returns an item collection, built either from a list of sites as the
        first argument and a list of titles as the second argument, or from a
        list of (site, title) tuples as the only argument."""
        if arg1 and not arg2:
            # then arg1 is [[site, title], [site, title]]
            sites = [x[0] for x in arg1]
            titles = [x[1] for x in arg1]
        else:
            sites = arg1
            titles = arg2
        resp = self.request.get({"action": "wbgetentities", "sites": "|".join(sites), "titles": "|".join(titles)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemByInterwiki(self, site, title):
        """Returns the item which has the requested site and title."""
        return self.getItemsByInterwiki([site], [title])[0]

    def save(self, items, comment=None):
        """Saves a list of items or a single item, with an optional second
        parameter being the edit summary."""
        if type(items) != list:
            items = [items]
        for item in items:
            params = {"action": "wbeditentity"}
            if item.id:
                params["id"] = "q" + str(item.id)
            if comment:
                params["summary"] = comment
            if self.config["botflag"]:
                params["bot"] = "1"
            data = {
                "sitelinks": item.sitelinks.export(),
                "aliases": item.aliases.export(),
                "labels": item.labels.export(),
                "descriptions": item.descriptions.export()
            }
            params["data"] = json.dumps(data, ensure_ascii=False)
            self.request.postWithToken(params)

    def _createItemCollection(self, data):
        items = []
        for item in data:
            i = self._createItem(data[item])
            items.append(i)
        return items

    def _createItem(self, item, target=None):
        if "sitelinks" not in item:
            item["sitelinks"] = {}
        if "aliases" not in item:
            item["aliases"] = {}
        if "labels" not in item:
            item["labels"] = {}
        if "descriptions" not in item:
            item["descriptions"] = {}
        sitelinks = {}
        for x in item["sitelinks"]:
            sitelinks[x] = item["sitelinks"][x]["title"]
        aliases = {}
        for x in item["aliases"]:
            aliases[x] = [y["value"] for y in item["aliases"][x]]
        labels = {}
        for x in item["labels"]:
            labels[x] = item["labels"][x]["value"]
        descriptions = {}
        for x in item["descriptions"]:
            descriptions[x] = item["descriptions"][x]["value"]
        if target:
            target.sitelinks = sitelinks
            target.aliases = aliases
            target.labels = labels
            target.descriptions = descriptions
            if target.id and target.id != item["id"]:
                raise errors.ItemIDMismatch("Local item id does not match remote id. Have you added the id manually?")
            else:
                target.id = item["id"]
            return target
        else:
            i = Item(sitelinks, aliases, labels, descriptions)
            i.id = int(item["id"].lower().replace("q", ""))
            return i

class API:
    def __init__(self, config):
        """Takes a configReader.Config object as an argument."""
        self.request = RequestHandler(config)
        self.config = config

    def getItemsById(self, ids=[]):
        """Returns an item collection. Takes a list of ids as an argument."""
        ids = [str(x) for x in ids]
        resp = self.request.get({"action": "wbgetentities", "ids": "|".join(ids)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemById(self, iid):
        """Returns the item defined by the argument."""
        return self.getItemsById([iid])[0]

    def getItemsByInterwiki(self, arg1=[], arg2=[]):
        """Returns an item collection, built either from a list of sites as the
        first argument and a list of titles as the second argument, or from a
        list of (site, title) tuples as the only argument."""
        if arg1 and not arg2:
            # then arg1 is [[site, title], [site, title]]
            sites = [x[0] for x in arg1]
            titles = [x[1] for x in arg1]
        else:
            sites = arg1
            titles = arg2
        resp = self.request.get({"action": "wbgetentities", "sites": "|".join(sites), "titles": "|".join(titles)})
        items = self._createItemCollection(resp["entities"])
        return items

    def getItemByInterwiki(self, site, title):
        """Returns the item which has the requested site and title."""
        return self.getItemsByInterwiki([site], [title])[0]

    def save(self, items, comment=None):
        """Saves a list of items or a single item, with an optional second
        parameter being the edit summary."""
        if type(items) != list:
            items = [items]
        for item in items:
            params = {"action": "wbsetitem"}
            if item.id:
                params["id"] = item.id
            if comment:
                params["summary"] = comment
            if self.config["botflag"]:
                params["bot"] = "1"
            data = {
                "sitelinks": item.sitelinks.export(),
                "aliases": item.aliases.export(),
                "labels": item.labels.export(),
                "descriptions": item.descriptions.export()
            }
            params["data"] = json.dumps(data, ensure_ascii=False)
            self.request.postWithToken(params)

    def _createItemCollection(self, data):
        items = []
        for item in data:
            i = self._createItem(data[item])
            items.append(i)
        return items

    def _createItem(self, item, target=None):
        if "sitelinks" not in item:
            item["sitelinks"] = {}
        if "aliases" not in item:
            item["aliases"] = {}
        if "labels" not in item:
            item["labels"] = {}
        if "descriptions" not in item:
            item["descriptions"] = {}
        sitelinks = {}
        for x in item["sitelinks"]:
            sitelinks[x] = item["sitelinks"][x]["title"]
        aliases = {}
        for x in item["aliases"]:
            aliases[x] = [y["value"] for y in item["aliases"][x]]
        labels = {}
        for x in item["labels"]:
            labels[x] = item["labels"][x]["value"]
        descriptions = {}
        for x in item["descriptions"]:
            descriptions[x] = item["descriptions"][x]["value"]
        if target:
            target.sitelinks = sitelinks
            target.aliases = aliases
            target.labels = labels
            target.descriptions = descriptions
            if target.id and target.id != item["id"]:
                raise errors.ItemIDMismatch("Local item id does not match remote id. Have you added the id manually?")
            else:
                target.id = item["id"]
            return target
        else:
            i = Item(sitelinks, aliases, labels, descriptions)
            i.id = int(item["id"].replace("q", ""))
            return i

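# --- Hypothetical usage sketch (not part of the original module) ---
# Both API variants above are constructed from a configReader.Config object and
# expose the same query/save surface. The config loading, the item id 42 and
# the sitelink below are placeholders chosen for illustration.
api = API(config)                                       # `config` loaded elsewhere
item = api.getItemById(42)                              # fetch item q42 by numeric id
other = api.getItemByInterwiki("enwiki", "Douglas Adams")
api.save([item, other], comment="example batch edit")   # push local state back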