import sys


def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        try:
            # strips are served below /d/, so run the robots.txt check there
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name),
                  file=sys.stderr)
            continue
        res[name] = comicurl
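# A minimal sketch of the contains_case_insensitive helper these parsers rely
# on; the real implementation lives in the project's utility module, and this
# assumed equivalent only shows the intended semantics.
def contains_case_insensitive(adict, akey):
    """Check if akey is in adict, ignoring case."""
    lower = akey.lower()
    return any(key.lower() == lower for key in adict)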
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        mo = descurl_matcher.search(match.group(1))
        desc = get_description(url + mo.group(1), session)
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name),
                  file=sys.stderr)
            continue
        res[name] = (comicurl, desc)
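# The unescape/asciify/capfirst chain above spells out what format_name
# condenses in the first variant. These are hedged stand-ins for the real
# helpers, assuming standard HTML-entity decoding and ASCII-only class names:
import html
import re


def unescape(text):
    """Replace HTML entities such as &amp; with their characters."""
    return html.unescape(text)


def asciify(name):
    """Drop all characters that are not ASCII letters, digits or underscores."""
    return re.sub(r'[^0-9a-zA-Z_]', '', name)


def capfirst(text):
    """Upper-case only the first character of text."""
    return text[:1].upper() + text[1:]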
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name),
                  file=sys.stderr)
            continue
        res[name] = comicurl
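# getPageContent is assumed here to fetch a URL through the given session and
# return the page text; note that other variants unpack (data, baseUrl)
# instead, so the helper's return shape differs between versions. A one-value
# sketch built on requests:
import requests


def getPageContent(url, session):
    """Fetch url via session and return the page text."""
    response = session.get(url)
    response.raise_for_status()
    return response.text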
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        # the comic's own URL; kept distinct from the search page url parameter
        comicurl = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        comicurl = url_overrides.get(name, comicurl)
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for comicgenesis", repr(name),
                  file=sys.stderr)
            continue
        else:
            res[name] = (comicurl, num)
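# Every variant funnels candidate URLs through check_robotstxt and treats an
# IOError as a denial. A possible implementation on top of the standard
# library's urllib.robotparser; the real helper presumably fetches through the
# given session, which RobotFileParser.read() does not:
from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser


def check_robotstxt(url, session):
    """Raise IOError if the site's robots.txt forbids fetching url."""
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    if not parser.can_fetch('*', url):
        raise IOError('robots.txt denied access to %s' % url)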
def collect_results(self):
    """Parse the comic list page."""
    data = self.get_url('https://www.webtoons.com/en/dailySchedule')
    for comiclink in data.xpath('//a[contains(@class, "daily_card_item")]'):
        comicurl = comiclink.attrib['href']
        name = comiclink.xpath('.//div[@class="info"]/p[@class="subj"]')[0].text
        try:
            check_robotstxt(comicurl, self.session)
        except IOError as e:
            print('[%s] INFO: robots.txt denied: %s' % (name, e))
            continue
        self.add_comic(name, comicurl)
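# The class-based collectors call self.get_url and receive something with an
# .xpath method; an assumed equivalent built on requests and lxml:
import lxml.html
import requests


def get_url(url, session=None):
    """Fetch url and return the parsed document as an lxml tree."""
    session = session or requests.Session()
    response = session.get(url)
    response.raise_for_status()
    return lxml.html.document_fromstring(response.text)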
def collect_results(self):
    """Parse the front page."""
    data = self.get_url('http://keenspot.com/')
    for comiclink in data.xpath('//td[@id]/a'):
        comicurl = comiclink.attrib['href']
        name = comiclink.xpath("string()")
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", self.session)
            else:
                check_robotstxt(comicurl, self.session)
        except IOError as e:
            print("[%s] INFO: robots.txt denied: %s" % (name, e))
            continue
        self.add_comic(name, comicurl)
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        # the comic's own URL; kept distinct from the search page url parameter
        comicurl = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        comicurl = url_overrides.get(name, comicurl)
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for", repr(name),
                  file=sys.stderr)
            continue
        else:
            res[name] = (comicurl, num)
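# The num_matcher scan above searches data[match.end():] so that each comic
# link is paired with the image count that follows it in the page, not with an
# earlier comic's count. Both patterns below are hypothetical; this example
# only illustrates the mechanics:
import re

url_matcher = re.compile(r'<a href="([^"]+)">')  # hypothetical pattern
num_matcher = re.compile(r'(\d+) images')        # hypothetical pattern

data = ('<a href="http://one.example.com">One</a> 12 images '
        '<a href="http://two.example.com">Two</a> 34 images')
for match in url_matcher.finditer(data):
    mo = num_matcher.search(data[match.end():])
    print(match.group(1), int(mo.group(1)))  # pairs each link with its count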