def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = remove_html_tags(mo.group(1))
        desc = unescape(desc)
        desc = unquote(desc)
        desc = compact_whitespace(desc).strip()
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]
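# The helper below is assumed by every handle_url variant in this section but
# is not defined here; this is a minimal sketch of a plausible implementation
# (an assumption, not necessarily the project's actual code):
def contains_case_insensitive(adict, key):
    """Check if key is in adict, ignoring case."""
    lower = key.lower()
    return any(k.lower() == lower for k in adict)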
def format_description(text):
    """Format a comic description."""
    desc = remove_html_tags(text)
    desc = unescape(desc)
    desc = unquote(desc)
    desc = compact_whitespace(desc).strip()
    return desc
def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl
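# check_robotstxt is also not defined in this section. A minimal sketch of the
# assumed behaviour (raise IOError when robots.txt disallows fetching the URL),
# built on the standard-library robotparser; the real helper presumably reuses
# the HTTP session instead of urllib:
from urllib import robotparser
from urllib.parse import urljoin

def check_robotstxt(url, session):
    """Raise IOError if robots.txt forbids fetching url."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, "/robots.txt"))
    rp.read()
    if not rp.can_fetch("*", url):
        raise IOError("robots.txt denied access to %s" % url)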
def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)
def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (url_overrides.get(name, url), num)
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        res[name] = shortname
def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for", repr(name))
            continue
        else:
            res[name] = (url, num)
def format_name(text):
    """Format a comic name."""
    name = unescape(text)
    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
    name = capfirst(name)
    return name
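# asciify and capfirst are assumed throughout this section; below is a sketch
# of plausible implementations, inferred from how the call sites use them
# (assumptions, not the project's actual code):
import re

def asciify(name):
    # Keep only ASCII letters, digits and underscores.
    return re.sub(r"[^0-9a-zA-Z_]", "", name)

def capfirst(text):
    # Capitalize the first character, leave the rest unchanged.
    return text[:1].upper() + text[1:]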
def test_unescape(self):
    # Test HTML entity replacement.
    self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
    self.assertEqual(unescape(u'foo&nbsp;bar'), u'foo\xa0bar')
    self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
def test_unescape(self):
    # Test HTML entity replacement. This variant of unescape also
    # percent-quotes characters that are unsafe in URLs.
    self.assertEqual(unescape('foo&amp;bar'), 'foo&bar')
    self.assertEqual(unescape('foo&nbsp;bar'), 'foo%C2%A0bar')
    self.assertEqual(unescape('&quot;foo&quot;'), '%22foo%22')
def format_name(text):
    """Format a comic name."""
    name = unescape(text)
    name = "".join(capfirst(x) for x in name.split(" "))
    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
    return name
def test_unescape(self):
    # Test HTML entity replacement.
    assert unescape(u'foo&amp;bar') == u'foo&bar'
    assert unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
    assert unescape(u'&quot;foo&quot;') == u'"foo"'
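# For reference, the plain (non-quoting) unescape behaviour exercised in the
# two tests above matches the standard library, assuming the project helper is
# a thin wrapper around it; the percent-quoting variant tested earlier is not:
from html import unescape as html_unescape

assert html_unescape('foo&amp;bar') == 'foo&bar'
assert html_unescape('foo&nbsp;bar') == 'foo\xa0bar'
assert html_unescape('&quot;foo&quot;') == '"foo"'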
def format_name(text):
    """Format a comic name."""
    name = unescape(text)
    name = asciify(name.replace('&', 'And').replace('@', 'At'))
    name = capfirst(name)
    return name
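# Example of the intended normalization pipeline (using the sketched helpers
# above; actual output depends on the project's real asciify/capfirst):
#   format_name('the &amp; gang')
#   -> unescape -> 'the & gang' -> 'the And gang' -> 'theAndgang' -> 'TheAndgang'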