def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]
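# The helpers used above (getPageContent, contains_case_insensitive,
# format_name, format_description) are defined elsewhere in this
# repository; the following is a minimal sketch of plausible
# implementations, purely illustrative, not the project's actual code.
import re

def getPageContent(url, session):
    """Fetch url; return (page text, base URL). Raise IOError on failure."""
    response = session.get(url)
    if response.status_code != 200:
        raise IOError("could not fetch %r: HTTP %d" % (url, response.status_code))
    return response.text, url

def contains_case_insensitive(adict, akey):
    """Check whether akey is a key of adict, ignoring case."""
    lower = akey.lower()
    return any(key.lower() == lower for key in adict)

def format_name(text):
    """Capitalize words and drop characters unsuitable for a class name."""
    name = ' '.join(word.capitalize() for word in text.split())
    return re.sub(r'[^0-9a-zA-Z_]', '', name)

def format_description(text):
    """Collapse runs of whitespace in a scraped description snippet."""
    return ' '.join(text.split())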
def write_description(f, out, classname, info):
    """Add description to class."""
    for line in f:
        out.write(line)
        if line.startswith('class %s(_BasicScraper):' % classname):
            description = format_description(info['description'])
            out.write(u'    description = %r\n' % description)
    return 0
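# A sketch of how write_description might be driven: stream a plugin
# module through it so the scraped description is injected right after the
# class statement. The temp-file convention and the info dict layout are
# assumptions for illustration only.
import codecs
import os

def patch_module(filename, classname, info):
    """Rewrite filename, adding a description attribute to classname."""
    tmpname = filename + '.new'  # hypothetical temp-file name
    with codecs.open(filename, 'r', 'utf-8') as f:
        with codecs.open(tmpname, 'w', 'utf-8') as out:
            write_description(f, out, classname, info)
    os.rename(tmpname, filename)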
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = format_name(comicurl.split('.', 1)[0][7:])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find description
        end = match.end()
        mo = desc_matcher.search(data[end:])
        if not mo:
            print("ERROR matching description:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # find out how many images this comic has
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find genre
        mo = genre_matcher.search(data[end:])
        if not mo:
            print("ERROR matching genre:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        genre = mo.group(1)
        # find activity
        mo = activity_matcher.search(data[end:])
        if not mo:
            print("ERROR matching activity:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        active = mo.group(1).lower() == "active"
        res[name] = [comicurl, desc, num, genre, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)
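# A minimal driver for either handle_url variant above, assuming the
# search results are paginated; the base URL and the page range are
# placeholders, not the real site's layout.
import sys
import requests

def main():
    res = {}
    session = requests.Session()
    for page in range(1, 21):  # hypothetical number of result pages
        url = "http://example.com/comics/search?page=%d" % page
        handle_url(url, session, res)
    for name in sorted(res):
        print(name, res[name])
    print("found", len(res), "comics", file=sys.stderr)

if __name__ == '__main__':
    main()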
def get_description(url, session):
    """Get comic strip description."""
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return u""
    mo = desc_matcher.search(data)
    if not mo:
        print("ERROR:", repr(data), file=sys.stderr)
        return u""
    return format_description(mo.group(1))
def get_description(url, session):
    """Get comic strip description."""
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return ""
    mo = desc_matcher.search(data)
    if not mo:
        print("ERROR:", repr(data), file=sys.stderr)
        return ""
    return format_description(mo.group(1))
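# get_description is presumably called after handle_url has filled res,
# to fetch a description for entries where the search page had none. A
# hedged example, assuming the [url, num, desc, adult, bounce] layout of
# the first handle_url above (description at index 2):
for name, entry in sorted(res.items()):
    if not entry[2]:  # description still missing
        entry[2] = get_description(entry[0], session)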