Example #1
0
def handle_url(url, session, res):
    """Parse one search result page and collect the comics it lists.

    Every match of ``url_matcher`` in the page text yields a comic URL
    (group 2) and a name (group 3, passed through ``format_name``).  Comics
    that are in ``exclude_comics``, that collide with an existing entry
    apart from case, or whose ``check_robotstxt`` call raises ``IOError``
    are skipped; all others are stored as ``res[name] = comicurl``.

    Args:
        url: Address of the search result page to fetch.
        session: Session object handed to ``get_page`` and
            ``check_robotstxt``.
        res: Mapping of comic name to comic URL; updated in place.

    Returns:
        None.  If fetching the page raises ``IOError`` the error is reported
        on stderr and the function returns before ``res`` is modified.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # The robots.txt check is made against the "d/" path of the comic;
        # append it unless the URL already contains a "/d/" component.
        robots_url = comicurl if "/d/" in comicurl else comicurl + "d/"
        try:
            check_robotstxt(robots_url, session)
        except IOError:
            # Diagnostics go to stderr, like every other message printed here.
            print("INFO: robots.txt denied for keenspot", repr(name),
                  file=sys.stderr)
            continue
        res[name] = comicurl
Example #2
0
def handle_url(url, session, res):
    """Parse one search result page and collect the comics it lists.

    Every match of ``url_matcher`` in the page text yields a comic base URL
    (group 1, with a trailing slash appended) and a name (group 2, passed
    through ``format_name``).  The first ``num_matcher`` hit after the match
    supplies an integer count (the image count, per the inline comment).
    Accepted comics are stored as ``res[name] = (comic_url, num)``.

    A comic is skipped when its name is in ``exclude_comics``, when it
    collides with an existing entry apart from case, when no count can be
    found, or when ``check_robotstxt`` raises ``IOError``.

    Args:
        url: Address of the search result page to fetch.
        session: Session object handed to ``get_page`` and
            ``check_robotstxt``.
        res: Mapping of comic name to ``(url, num)``; updated in place.

    Returns:
        None.  If fetching the page raises ``IOError`` the error is reported
        on stderr and the function returns before ``res`` is modified.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        # Use a dedicated local so the ``url`` parameter is not overwritten.
        comic_url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            # Show the text following the match to help diagnose the miss.
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # A per-comic override, when present, replaces the scraped URL.
        comic_url = url_overrides.get(name, comic_url)
        # The robots.txt check is made against the "d/" path of the comic;
        # append it unless the URL already contains a "/d/" component.
        robots_url = comic_url if "/d/" in comic_url else comic_url + "d/"
        try:
            check_robotstxt(robots_url, session)
        except IOError:
            # Diagnostics go to stderr, like every other message printed here.
            print("INFO: robots.txt denied for comicgenesis", repr(name),
                  file=sys.stderr)
            continue
        res[name] = (comic_url, num)
Example #3
0
 def get_url(self, url, expand=True):
     """Fetch a page and return it parsed as an LXML HTML document.

     The URL is announced on stderr before the fetch.  When ``expand`` is
     true, links in the parsed document are made absolute relative to
     ``url``.  An ``IOError`` raised along the way is reported on stderr and
     then re-raised to the caller.
     """
     print("Parsing", url, file=sys.stderr)
     try:
         page_text = get_page(url, self.session).text
         tree = html.document_fromstring(page_text)
         if expand:
             tree.make_links_absolute(url)
     except IOError as err:
         print("ERROR:", err, file=sys.stderr)
         raise
     else:
         return tree
Example #4
0
 def get_url(self, url, expand=True):
     """Download ``url`` and hand back the LXML document parsed from it.

     A "Parsing" line is written to stderr first.  If ``expand`` is true the
     document's links are rewritten to absolute form using ``url`` as the
     base.  Any ``IOError`` is logged to stderr and propagated unchanged.
     """
     print("Parsing", url, file=sys.stderr)
     try:
         response = get_page(url, self.session)
         document = html.document_fromstring(response.text)
         if expand:
             document.make_links_absolute(url)
     except IOError as error:
         print("ERROR:", error, file=sys.stderr)
         raise
     else:
         return document
Example #5
0
 def get_url(self, url, expand=True, robot=True):
     """Fetch a page and return it parsed as an LXML HTML document.

     The URL is announced on stderr before the fetch.  ``robot`` is passed
     straight through to ``get_page`` as its third argument (presumably it
     toggles robots.txt handling; confirm against ``get_page``).  When
     ``expand`` is true, links in the document are made absolute relative to
     ``url``.  If ``self.sleep`` is positive the method pauses for that many
     seconds before returning.  An ``IOError`` is reported on stderr and then
     re-raised to the caller.
     """
     print("Parsing", url, file=sys.stderr)
     try:
         response = get_page(url, self.session, robot)
         tree = lxml.html.document_fromstring(response.text)
         if expand:
             tree.make_links_absolute(url)
         if self.sleep > 0:
             time.sleep(self.sleep)
     except IOError as err:
         print("ERROR:", err, file=sys.stderr)
         raise
     else:
         return tree
Example #6
0
def handle_url(url, session, res):
    """Parse one search result page and record every usable comic in ``res``.

    For each ``url_matcher`` hit in the page text, group 1 (plus a trailing
    slash) is the comic's base URL and group 2 (after ``format_name``) is its
    name.  The first ``num_matcher`` hit following the match supplies an
    integer count (the image count, per the inline comment).  Accepted
    comics are stored as ``res[name] = (comic_url, num)``.

    A comic is skipped when its name is in ``exclude_comics``, when it
    collides with an existing entry apart from case, when no count can be
    found, or when ``check_robotstxt`` raises ``IOError``.

    Args:
        url: Address of the search result page to fetch.
        session: Session object handed to ``get_page`` and
            ``check_robotstxt``.
        res: Mapping of comic name to ``(url, num)``; updated in place.

    Returns:
        None.  If fetching the page raises ``IOError`` the error is reported
        on stderr and the function returns before ``res`` is modified.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        # Use a dedicated local so the ``url`` parameter is not overwritten.
        comic_url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate",
                  repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            # Show the text following the match to help diagnose the miss.
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # A per-comic override, when present, replaces the scraped URL.
        comic_url = url_overrides.get(name, comic_url)
        # The robots.txt check is made against the "d/" path of the comic;
        # append it unless the URL already contains a "/d/" component.
        robots_url = comic_url if "/d/" in comic_url else comic_url + "d/"
        try:
            check_robotstxt(robots_url, session)
        except IOError:
            # Diagnostics go to stderr, like every other message printed here.
            print("INFO: robots.txt denied for comicgenesis",
                  repr(name),
                  file=sys.stderr)
            continue
        res[name] = (comic_url, num)