def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)

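# All variants below rely on contains_case_insensitive() to skip comic names
# that differ only in case. A minimal sketch of such a helper (an assumption,
# not necessarily the project's actual implementation):
def contains_case_insensitive(adict, akey):
    """Check if akey is already a key of adict, ignoring case (illustrative sketch)."""
    lowered = akey.lower()
    return any(key.lower() == lowered for key in adict)
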
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

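# Some variants call format_name() while others inline the same steps
# (unescape, '&' -> 'And', '@' -> 'At', asciify, capfirst). A hedged sketch of
# a helper combining those steps, assuming the same unescape/asciify/capfirst
# helpers used above:
def format_name(text):
    """Format a comic name (sketch reconstructed from the inline variants)."""
    name = unescape(text)
    name = asciify(name.replace('&', 'And').replace('@', 'At'))
    name = capfirst(name)
    return name
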
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        mo = descurl_matcher.search(match.group(1))
        desc = get_description(url + mo.group(1), session)
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = (comicurl, desc)

def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = format_name(comicurl.split('.', 1)[0][7:])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find activity
        mo = activity_matcher.search(data[end:])
        if not mo:
            print("ERROR matching activity:", repr(data[end:end+300]), file=sys.stderr)
            continue
        active = mo.group(1).lower() == "active"
        res[name] = [comicurl, num, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        # "html" here is expected to be lxml.html (document_fromstring / cssselect)
        data = html.document_fromstring(getPageContent(url, session))
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    num = 999
    for comicdiv in data.cssselect('div.searchresult'):
        comiclink = comicdiv.cssselect('h3 a')[0]
        comicurl = comiclink.attrib['href']
        name = format_name(comiclink.text)
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        info = comicdiv.cssselect('span.comicinfo')
        # find out how many images this comic has
        num = int(info[1].text.strip())
        # find activity
        active = info[6].text.strip().lower() == "active"
        lang = info[7].text.strip().lower()
        res[name] = [comicurl, num, active, lang]
    return num

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for comicgenesis", repr(name))
            continue
        else:
            res[name] = (url, num)

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (url_overrides.get(name, url), num)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = format_name(comicurl.split('.', 1)[0][7:])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find description
        end = match.end()
        mo = desc_matcher.search(data[end:])
        if not mo:
            print("ERROR matching description:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # find out how many images this comic has
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find genre
        mo = genre_matcher.search(data[end:])
        if not mo:
            print("ERROR matching genre:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        genre = mo.group(1)
        # find activity
        mo = activity_matcher.search(data[end:])
        if not mo:
            print("ERROR matching activity:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        active = mo.group(1).lower() == "active"
        res[name] = [comicurl, desc, num, genre, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = remove_html_tags(mo.group(1))
        desc = unescape(desc)
        desc = unquote(desc)
        desc = compact_whitespace(desc).strip()
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]

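# The variant above inlines the description cleanup that other variants wrap
# in format_description(). A sketch of such a helper, assuming the same
# remove_html_tags/unescape/unquote/compact_whitespace helpers used above:
def format_description(text):
    """Clean up a scraped comic description (sketch based on the code above)."""
    desc = remove_html_tags(text)
    desc = unescape(desc)
    desc = unquote(desc)
    desc = compact_whitespace(desc).strip()
    return desc
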
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for adult flag
        # note: end is the offset from the listing-page match, reused here on data2
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, bool(adult), bounce
        ]

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        res[name] = shortname

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for", repr(name))
            continue
        else:
            res[name] = (url, num)

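# For context, each handle_url() variant is driven by a loop over search
# result pages that accumulates entries into a shared dict. A hypothetical
# driver for the (url, session, res) signature; the page URL pattern and page
# count below are placeholders, not taken from the original scripts:
import requests  # assumed: the variants above pass a requests-style session


def get_results():
    """Collect comic data from all search result pages (illustrative sketch)."""
    res = {}
    session = requests.Session()
    base = "https://example.com/search?page=%d"  # placeholder URL
    for page in range(1, 10):                    # placeholder page range
        handle_url(base % page, session, res)
    return res
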