def handle_url(url, session, res):
    """Collect the comics listed on one search result page into ``res``.

    The page is fetched with ``getPageContent``; if that raises ``IOError``
    the error is printed to stderr and nothing is added.  For every
    ``url_matcher`` hit, group 2 is taken as the comic URL and group 3 as
    the raw name.  The name is unescaped, has ``&``/``@`` spelled out and is
    passed through ``asciify`` and ``capfirst``.

    A hit is skipped when its name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, or when
    ``check_robotstxt`` raises ``IOError``.

    ``res`` is modified in place, mapping name -> comic URL.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        link = hit.group(2)
        title = unescape(hit.group(3))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))

        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(title),
                  file=sys.stderr)
            continue

        try:
            # URLs without a "/d/" component are checked with "d/" appended
            check_robotstxt(link if "/d/" in link else link + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(title))
            continue

        res[title] = link
def handle_url(url, session, url_matcher, num_matcher, res):
    """Collect the comics listed on one search result page into ``res``.

    The page is fetched with ``getPageContent``; if that raises ``IOError``
    the error is printed to stderr and nothing is added.  For every
    ``url_matcher`` hit, group 1 is unescaped and unquoted to get the comic
    URL.  The text after the last ``/`` (ignoring the URL's final character)
    becomes the path, and the name is ``capfirst(asciify(path))``.

    A hit is skipped when ``contains_case_insensitive(res, name)`` is true,
    when the name is in ``exclude_comics``, or when ``num_matcher`` finds
    nothing in the page text following group 1.

    ``res`` is modified in place, mapping name -> ``(path, num)`` where
    ``num`` is group 1 of the ``num_matcher`` hit converted to ``int``.
    """
    try:
        page = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        link = unquote(unescape(hit.group(1)))
        # drop the final character, then keep what follows the last slash
        path = link[:-1].rpartition('/')[2]
        title = capfirst(asciify(path))

        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(title),
                  file=sys.stderr)
            continue
        if title in exclude_comics:
            continue

        # find out how many images this comic has
        tail_start = hit.end(1)
        count_hit = num_matcher.search(page[tail_start:])
        if not count_hit:
            print("ERROR:", repr(page[tail_start:tail_start+300]),
                  file=sys.stderr)
            continue

        res[title] = (path, int(count_hit.group(1)))
def handle_url(url, session, res):
    """Scan one search result page and record each accepted comic in ``res``.

    Fetching uses ``getPageContent``; an ``IOError`` is printed to stderr
    and ends the call.  Each ``url_matcher`` hit supplies the comic URL
    (group 2) and the raw name (group 3).  The name is unescaped, ``&`` and
    ``@`` are replaced by ``And``/``At``, and the result goes through
    ``asciify`` and ``capfirst``.

    Hits are dropped when the name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, or when
    ``check_robotstxt`` raises ``IOError``.

    ``res`` is modified in place, mapping name -> comic URL.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        html = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return

    for found in url_matcher.finditer(html):
        comic_link = found.group(2)
        raw_name = unescape(found.group(3))
        spelled_out = raw_name.replace("&", "And").replace("@", "At")
        comic_name = capfirst(asciify(spelled_out))

        if comic_name in exclude_comics:
            continue
        if contains_case_insensitive(res, comic_name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(comic_name),
                  file=sys.stderr)
            continue

        try:
            if "/d/" in comic_link:
                check_robotstxt(comic_link, session)
            else:
                # no "/d/" component: check with "d/" appended
                check_robotstxt(comic_link + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(comic_name))
            continue

        res[comic_name] = comic_link
def handle_url(url, res):
    """Collect the comics listed on one search result page into ``res``.

    The page is fetched with ``getPageContent`` (only the first element of
    its result is used); an ``IOError`` is printed to stderr and ends the
    call.  For each ``url_matcher`` hit, group 1 plus a trailing ``/`` is
    the comic URL and group 2 the raw name.  The name is unescaped, has
    ``&``/``@`` spelled out and is passed through ``asciify`` and
    ``capfirst``.

    A hit is skipped when its name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, or when
    ``num_matcher`` finds nothing in the page text after the hit.

    ``res`` is modified in place, mapping name -> ``(url, num)``; the URL
    is taken from ``url_overrides`` when that has an entry for the name.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        comic_url = hit.group(1) + '/'
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))

        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", title,
                  file=sys.stderr)
            continue

        # find out how many images this comic has
        tail_start = hit.end()
        count_hit = num_matcher.search(page[tail_start:])
        if not count_hit:
            print("ERROR:", repr(page[tail_start:tail_start+300]),
                  file=sys.stderr)
            continue

        count = int(count_hit.group(1))
        res[title] = (url_overrides.get(title, comic_url), count)
def handle_url(url, session, url_matcher, num_matcher, res):
    """Scan one search result page and record each accepted comic in ``res``.

    Fetching uses ``getPageContent`` (only the first element of its result
    is used); an ``IOError`` is printed to stderr and ends the call.  Group
    1 of each ``url_matcher`` hit is unescaped and unquoted.  The piece
    after the last ``/`` (ignoring the final character) is kept as the
    path, and the name is ``capfirst(asciify(path))``.

    Hits are dropped when ``contains_case_insensitive(res, name)`` is true,
    when the name is in ``exclude_comics``, or when ``num_matcher`` finds
    nothing in the page text following group 1.

    ``res`` is modified in place, mapping name -> ``(path, num)``.
    """
    try:
        html, _base_url = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return

    for found in url_matcher.finditer(html):
        decoded = unquote(unescape(found.group(1)))
        # strip the final character, keep the last path component
        slug = decoded[:-1].rpartition('/')[2]
        comic_name = capfirst(asciify(slug))

        if contains_case_insensitive(res, comic_name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(comic_name),
                  file=sys.stderr)
            continue
        if comic_name in exclude_comics:
            continue

        # find out how many images this comic has
        offset = found.end(1)
        number = num_matcher.search(html[offset:])
        if not number:
            print("ERROR:", repr(html[offset:offset+300]), file=sys.stderr)
            continue

        res[comic_name] = (slug, int(number.group(1)))
def test_names(self):
    """Check every scraper class name: at most one slash, ASCII-stable.

    The part after the slash (or the whole name when there is none) must be
    left unchanged by ``util.asciify``.
    """
    for cls in scraper.get_scraperclasses():
        fullname = cls.getName()
        self.assertTrue(fullname.count('/') <= 1, fullname)
        comicname = fullname.split('/')[1] if '/' in fullname else fullname
        self.assertEqual(util.asciify(comicname), comicname)
def test_names(self):
    """Check every scraper class name: at most one slash, ASCII-stable.

    The part after the slash (or the whole name when there is none) must be
    left unchanged by ``util.asciify``.
    """
    for cls in scraper.get_scraperclasses():
        fullname = cls.getName()
        assert fullname.count('/') <= 1
        comicname = fullname.split('/')[1] if '/' in fullname else fullname
        assert util.asciify(comicname) == comicname
def test_names(self):
    """Check every scraper name: at most one slash, ASCII-stable comic part.

    The part after the slash (or the whole name when there is none) must be
    left unchanged by ``util.asciify``.
    """
    for entry in scraper.get_scrapers():
        fullname = entry.get_name()
        self.assertTrue(fullname.count('/') <= 1, fullname)
        if '/' not in fullname:
            comicname = fullname
        else:
            comicname = fullname.split('/')[1]
        self.assertEqual(util.asciify(comicname), comicname)
def test_names(self):
    """Check the name and description of every scraper class.

    Names may contain at most one slash, and the part after it (or the
    whole name when there is none) must be left unchanged by
    ``util.asciify``.  The ``description`` attribute must be an instance of
    ``text_type``.
    """
    for cls in scraper.get_scraperclasses():
        fullname = cls.getName()
        self.assertTrue(fullname.count('/') <= 1, fullname)
        comicname = fullname.split('/')[1] if '/' in fullname else fullname
        self.assertEqual(util.asciify(comicname), comicname)

        description = cls.description
        failure_msg = "Invalid description in %s: %r" % (cls, description)
        self.assertTrue(isinstance(description, text_type), failure_msg)
def handle_url(url, res):
    """Collect the comics listed on one search result page into ``res``.

    The result page is fetched with ``getPageContent``; an ``IOError`` is
    printed to stderr and ends the call.  For each ``page_matcher`` hit:

    * group 1 is joined onto ``url`` to form the comic's detail page URL,
    * group 2 is the raw name, which is unescaped, has ``&``/``@`` spelled
      out and is passed through ``asciify`` and ``capfirst``,
    * ``num_matcher`` is searched in the text after the hit for the number,
    * the detail page is fetched, ``url_matcher`` supplies the comic URL,
      and ``desc_matcher`` (searched after the URL hit) the description,
    * ``adult_matcher`` is searched from the same offset for the adult flag.

    A hit is skipped when its name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, or when any of the
    number/URL/description searches finds nothing.

    ``res`` is modified in place, mapping name ->
    ``[url, num, desc, adult, bounce]``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        listing, _base_url = getPageContent(url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in page_matcher.finditer(listing):
        detail_url = urlparse.urljoin(url, hit.group(1))
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))

        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", title,
                  file=sys.stderr)
            continue

        # find out how many images this comic has
        tail_start = hit.end()
        count_hit = num_matcher.search(listing[tail_start:])
        if not count_hit:
            print("ERROR matching number:",
                  repr(listing[tail_start:tail_start+300]), file=sys.stderr)
            continue
        count = int(count_hit.group(1))

        # search for url in extra page
        print("Getting", detail_url)
        try:
            detail, _detail_base = getPageContent(detail_url)
        except IOError as err:
            print("ERROR:", err, file=sys.stderr)
            # NOTE(review): this ends the whole run, not just this comic;
            # confirm that is intended.
            return

        url_hit = url_matcher.search(detail)
        if not url_hit:
            print("ERROR matching comic URL:", repr(detail[:300]),
                  file=sys.stderr)
            continue
        comic_url = url_hit.group(1)

        # search for description, starting after the URL hit
        after_url = url_hit.end()
        desc_hit = desc_matcher.search(detail[after_url:])
        if not desc_hit:
            print("ERROR matching comic description:",
                  repr(detail[after_url:after_url+300]), file=sys.stderr)
            continue
        description = unquote(unescape(remove_html_tags(desc_hit.group(1))))
        description = compact_whitespace(description).strip()

        # search for adult flag
        is_adult = bool(adult_matcher.search(detail[after_url:]))
        bounce = title not in repeat_comics

        res[title] = [
            url_overrides.get(title, comic_url),
            count,
            description,
            is_adult,
            bounce,
        ]
def test_names(self):
    """Check the name and description of every scraper class.

    Names may contain at most one slash, and the part after it (or the
    whole name when there is none) must be left unchanged by
    ``util.asciify``.  The ``description`` attribute must be an instance of
    ``text_type``.
    """
    for candidate in scraper.get_scraperclasses():
        fullname = candidate.getName()
        self.assertTrue(fullname.count('/') <= 1, fullname)

        if '/' not in fullname:
            comicname = fullname
        else:
            comicname = fullname.split('/')[1]
        self.assertEqual(util.asciify(comicname), comicname)

        description = candidate.description
        is_text = isinstance(description, text_type)
        self.assertTrue(
            is_text,
            "Invalid description in %s: %r" % (candidate, description))
def handle_url(url, res):
    """Collect the comics listed on one search result page into ``res``.

    The page is fetched with ``getPageContent`` (only the first element of
    its result is used); an ``IOError`` is printed to stderr and ends the
    call.  Each ``url_matcher`` hit gives a short name (group 1) and a raw
    name (group 2).  The raw name is unescaped, has ``&``/``@`` spelled out
    and is passed through ``asciify`` and ``capfirst``.

    A hit is skipped when its name is in ``exclude_comics`` or when
    ``contains_case_insensitive(res, name)`` is true.

    ``res`` is modified in place, mapping name -> short name.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        slug = hit.group(1)
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))

        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", title,
                  file=sys.stderr)
            continue

        res[title] = slug
def handle_url(url, session, res):
    """Scan one search result page and record each accepted comic in ``res``.

    Fetching uses ``getPageContent`` (only the first element of its result
    is used); an ``IOError`` is printed to stderr and ends the call.  Each
    ``url_matcher`` hit gives a short name (group 1) and a raw name (group
    2).  The raw name is unescaped, ``&`` and ``@`` are replaced by
    ``And``/``At``, and the result goes through ``asciify`` and
    ``capfirst``.

    Hits are dropped when the name is in ``exclude_comics`` or when
    ``contains_case_insensitive(res, name)`` is true.

    ``res`` is modified in place, mapping name -> short name.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        html, _base_url = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return

    for found in url_matcher.finditer(html):
        slug = found.group(1)
        raw_name = unescape(found.group(2))
        spelled_out = raw_name.replace('&', 'And').replace('@', 'At')
        comic_name = capfirst(asciify(spelled_out))

        if comic_name in exclude_comics:
            continue
        if contains_case_insensitive(res, comic_name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(comic_name),
                  file=sys.stderr)
            continue

        res[comic_name] = slug
def handle_url(url, session, res):
    """Collect the comics listed on one search result page into ``res``.

    The page is fetched with ``getPageContent`` (only the first element of
    its result is used); an ``IOError`` is printed to stderr and ends the
    call.  For each ``url_matcher`` hit, group 1 plus a trailing ``/`` is
    the comic URL and group 2 the raw name.  The name is unescaped, has
    ``&``/``@`` spelled out and is passed through ``asciify`` and
    ``capfirst``.  The URL is replaced by the ``url_overrides`` entry for
    the name when one exists.

    A hit is skipped when its name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, when ``num_matcher``
    finds nothing in the page text after the hit, or when
    ``check_robotstxt`` raises ``IOError``.

    ``res`` is modified in place, mapping name -> ``(url, num)``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        comic_url = hit.group(1) + '/'
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))

        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(title),
                  file=sys.stderr)
            continue

        # find out how many images this comic has
        tail_start = hit.end()
        count_hit = num_matcher.search(page[tail_start:])
        if not count_hit:
            print("ERROR:", repr(page[tail_start:tail_start + 300]),
                  file=sys.stderr)
            continue
        count = int(count_hit.group(1))

        comic_url = url_overrides.get(title, comic_url)
        try:
            # URLs without a "/d/" component are checked with "d/" appended
            check_robotstxt(
                comic_url if "/d/" in comic_url else comic_url + "d/",
                session)
        except IOError:
            print("INFO: robots.txt denied for", repr(title))
            continue

        res[title] = (comic_url, count)
def handle_url(url, session, res):
    """Scan one search result page and record each accepted comic in ``res``.

    Fetching uses ``getPageContent`` (only the first element of its result
    is used); an ``IOError`` is printed to stderr and ends the call.  Each
    ``url_matcher`` hit supplies the comic URL (group 1 plus ``/``) and the
    raw name (group 2).  The name is unescaped, ``&`` and ``@`` are replaced
    by ``And``/``At``, and the result goes through ``asciify`` and
    ``capfirst``.  An entry in ``url_overrides`` for the name replaces the
    URL.

    Hits are dropped when the name is in ``exclude_comics``, when
    ``contains_case_insensitive(res, name)`` is true, when ``num_matcher``
    finds nothing in the page text after the hit, or when
    ``check_robotstxt`` raises ``IOError``.

    ``res`` is modified in place, mapping name -> ``(url, num)``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        html, _base_url = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return

    for found in url_matcher.finditer(html):
        comic_link = found.group(1) + "/"
        raw_name = unescape(found.group(2))
        spelled_out = raw_name.replace("&", "And").replace("@", "At")
        comic_name = capfirst(asciify(spelled_out))

        if comic_name in exclude_comics:
            continue
        if contains_case_insensitive(res, comic_name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(comic_name),
                  file=sys.stderr)
            continue

        # find out how many images this comic has
        offset = found.end()
        number = num_matcher.search(html[offset:])
        if not number:
            print("ERROR:", repr(html[offset:offset + 300]), file=sys.stderr)
            continue
        image_count = int(number.group(1))

        comic_link = url_overrides.get(comic_name, comic_link)
        try:
            if "/d/" in comic_link:
                check_robotstxt(comic_link, session)
            else:
                # no "/d/" component: check with "d/" appended
                check_robotstxt(comic_link + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for", repr(comic_name))
            continue

        res[comic_name] = (comic_link, image_count)
def format_name(text):
    """Return ``text`` formatted as a comic name.

    The text is unescaped, ``&`` and ``@`` are replaced by ``And`` and
    ``At``, and the result is passed through ``asciify`` and ``capfirst``.
    """
    spelled_out = unescape(text).replace(u"&", u"And").replace(u"@", u"At")
    return capfirst(asciify(spelled_out))
def format_name(text):
    """Return ``text`` formatted as a comic name.

    The text is unescaped, ``&`` and ``@`` are replaced by ``And`` and
    ``At``, and the result is passed through ``asciify`` and ``capfirst``.
    """
    formatted = unescape(text)
    # spell out the symbols, in the same order as before: '&' then '@'
    for symbol, word in (('&', 'And'), ('@', 'At')):
        formatted = formatted.replace(symbol, word)
    return capfirst(asciify(formatted))
def format_name(text):
    """Return ``text`` formatted as a comic name.

    The text is unescaped, ``&`` and ``@`` are replaced by ``And`` and
    ``At``, and the result is passed through ``asciify`` and ``capfirst``.
    """
    unescaped = unescape(text)
    without_amp = unescaped.replace(u'&', u'And')
    without_at = without_amp.replace(u'@', u'At')
    return capfirst(asciify(without_at))