コード例 #1
0
ファイル: smackjeeves.py プロジェクト: johna513/dosage
def handle_url(url, res):
    """Parse one search result page and record its comics in ``res``.

    Every comic matched by ``page_matcher`` has its own page fetched so
    that the comic URL, the description and the adult flag can be read.

    Args:
        url: address of the search result page to fetch.
        res: dict updated in place; maps the formatted comic name to the
            list ``[comic_url, num, desc, adult, bounce]``.

    Returns:
        None. Problems are reported on stderr. If the result page itself
        cannot be fetched the call ends; a problem with one comic only
        skips that comic.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            # A single unreachable comic page must not discard the
            # remaining comics of this result page: skip only this one.
            continue
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description, starting after the URL match
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = remove_html_tags(mo.group(1))
        desc = unescape(desc)
        desc = unquote(desc)
        desc = compact_whitespace(desc).strip()
        # search for adult flag (also only in the text after the URL match)
        adult = adult_matcher.search(data2[end:])
        # bounce is True unless the comic is listed in repeat_comics
        bounce = name not in repeat_comics
        res[name] = [
          url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]
コード例 #2
0
ファイル: scriptutil.py プロジェクト: serenitas50/dosage
def format_description(text):
    """Return ``text`` cleaned up for use as a comic description.

    The text is passed through ``remove_html_tags``, ``unescape``,
    ``unquote`` and ``compact_whitespace`` in that order, then stripped.
    """
    without_tags = remove_html_tags(text)
    decoded = unquote(unescape(without_tags))
    return compact_whitespace(decoded).strip()
コード例 #3
0
def handle_url(url, session, url_matcher, num_matcher, res):
    """Collect the comics listed on one search result page.

    Each accepted comic is stored in ``res`` as ``name -> (path, num)``.
    Fetch and match problems are reported on stderr.
    """
    try:
        page, _base_url = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for hit in url_matcher.finditer(page):
        comic_url = unquote(unescape(hit.group(1)))
        # last path component after dropping the final character
        slug = comic_url[:-1].rsplit('/')[-1]
        title = capfirst(asciify(slug))
        if contains_case_insensitive(res, title):
            # names differing only in case cannot both be handled
            print("INFO: skipping possible duplicate", repr(title), file=sys.stderr)
            continue
        if title in exclude_comics:
            continue
        # the image count is searched for after the matched URL
        offset = hit.end(1)
        count_match = num_matcher.search(page[offset:])
        if count_match:
            res[title] = (slug, int(count_match.group(1)))
        else:
            print("ERROR:", repr(page[offset:offset+300]), file=sys.stderr)
コード例 #4
0
ファイル: keenspot.py プロジェクト: Freestila/dosage
def handle_url(url, session, res):
    """Scan one search result page and store ``name -> comic URL`` in ``res``.

    Comics that are excluded, duplicate an existing name ignoring case,
    or are rejected by ``check_robotstxt`` are left out.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for hit in url_matcher.finditer(page):
        comicurl = hit.group(2)
        raw_title = unescape(hit.group(3))
        title = capfirst(
            asciify(raw_title.replace('&', 'And').replace('@', 'At')))
        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # names differing only in case cannot both be handled
            print("INFO: skipping possible duplicate", repr(title),
                  file=sys.stderr)
            continue
        try:
            # the robots.txt check is made against the "d/" sub-path
            if "/d/" in comicurl:
                check_robotstxt(comicurl, session)
            else:
                check_robotstxt(comicurl + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(title))
            continue
        res[title] = comicurl
コード例 #5
0
ファイル: drunkduck.py プロジェクト: Freestila/dosage
def handle_url(url, session, url_matcher, num_matcher, res):
    """Read one search result page and add its comics to ``res``.

    ``res`` receives ``name -> (path, num)`` for every comic that is not
    excluded, not a case-insensitive duplicate, and has a count match.
    """
    try:
        html = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return
    for found in url_matcher.finditer(html):
        link = unquote(unescape(found.group(1)))
        # final character dropped, then the last path component is taken
        path = link[:-1].rsplit('/')[-1]
        comic_name = capfirst(asciify(path))
        if contains_case_insensitive(res, comic_name):
            # two comics differing only in case cannot both be kept
            print("INFO: skipping possible duplicate", repr(comic_name), file=sys.stderr)
            continue
        if comic_name in exclude_comics:
            continue
        # look for the image count in the text following the URL
        start = found.end(1)
        number = num_matcher.search(html[start:])
        if not number:
            print("ERROR:", repr(html[start:start+300]), file=sys.stderr)
            continue
        res[comic_name] = (path, int(number.group(1)))
コード例 #6
0
ファイル: keenspot.py プロジェクト: vangroan/dosage
def handle_url(url, session, res):
    """Process one search result page, filling ``res`` with comic URLs.

    Keys are the formatted comic names. Entries are skipped when the
    name is excluded, already present ignoring case, or when
    ``check_robotstxt`` raises ``IOError``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        content = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return
    for found in url_matcher.finditer(content):
        comicurl = found.group(2)
        label = unescape(found.group(3))
        label = label.replace("&", "And").replace("@", "At")
        label = capfirst(asciify(label))
        if label in exclude_comics:
            continue
        if contains_case_insensitive(res, label):
            # two comics differing only in case cannot both be kept
            print("INFO: skipping possible duplicate", repr(label), file=sys.stderr)
            continue
        try:
            # URLs without "/d/" are checked with "d/" appended
            if "/d/" in comicurl:
                check_robotstxt(comicurl, session)
            else:
                check_robotstxt(comicurl + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(label))
            continue
        res[label] = comicurl
コード例 #7
0
ファイル: scriptutil.py プロジェクト: dromaludaire/dosage
def format_description(text):
    """Clean up a raw comic description.

    Applies, in order: ``remove_html_tags``, ``unescape``, ``unquote``,
    ``compact_whitespace`` and a final ``strip``.
    """
    cleaned = text
    for step in (remove_html_tags, unescape, unquote, compact_whitespace):
        cleaned = step(cleaned)
    return cleaned.strip()
コード例 #8
0
ファイル: keenspot.py プロジェクト: johna513/dosage
def handle_url(url, res):
    """Parse one search result page into ``res``.

    Each accepted comic is stored as ``name -> (comic URL, num)``, where
    an entry in ``url_overrides`` for the name replaces the matched URL.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for hit in url_matcher.finditer(page):
        comic_url = hit.group(1) + '/'
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))
        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # names differing only in case cannot both be handled
            print("WARN: skipping possible duplicate", title, file=sys.stderr)
            continue
        # the image count is searched for after the whole match
        offset = hit.end()
        count_match = num_matcher.search(page[offset:])
        if count_match:
            count = int(count_match.group(1))
            res[title] = (url_overrides.get(title, comic_url), count)
        else:
            print("ERROR:", repr(page[offset:offset+300]), file=sys.stderr)
コード例 #9
0
def handle_url(url, session, res):
    """Parse one search result page, storing ``name -> shortname`` in ``res``.

    Excluded names and names already present in ``res`` ignoring case
    are skipped.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for hit in url_matcher.finditer(page):
        short = hit.group(1)
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))
        if title in exclude_comics:
            continue
        if not contains_case_insensitive(res, title):
            res[title] = short
            continue
        # names differing only in case cannot both be handled
        print("INFO: skipping possible duplicate", repr(title), file=sys.stderr)
コード例 #10
0
ファイル: gocomics.py プロジェクト: johna513/dosage
def handle_url(url, res):
    """Record the comics found on one search result page in ``res``.

    ``res`` maps each formatted comic name to the short name captured by
    the first group of ``url_matcher``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        content, _unused_base = getPageContent(url)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return
    for found in url_matcher.finditer(content):
        shortname = found.group(1)
        raw_title = unescape(found.group(2))
        title = capfirst(
            asciify(raw_title.replace('&', 'And').replace('@', 'At')))
        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # two comics differing only in case cannot both be kept
            print("WARN: skipping possible duplicate", title, file=sys.stderr)
            continue
        res[title] = shortname
コード例 #11
0
ファイル: comicgenesis.py プロジェクト: shartge/dosage
def handle_url(url, session, res):
    """Parse one search result page and store ``name -> (url, num)``.

    A comic is skipped when its name is excluded, when it duplicates an
    existing name ignoring case, when no image count is found, or when
    ``check_robotstxt`` raises ``IOError``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url, session)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for hit in url_matcher.finditer(page):
        comic_url = hit.group(1) + '/'
        title = unescape(hit.group(2))
        title = title.replace('&', 'And').replace('@', 'At')
        title = capfirst(asciify(title))
        if title in exclude_comics:
            continue
        if contains_case_insensitive(res, title):
            # names differing only in case cannot both be handled
            print("INFO: skipping possible duplicate", repr(title),
                  file=sys.stderr)
            continue
        # the image count is searched for after the whole match
        offset = hit.end()
        count_match = num_matcher.search(page[offset:])
        if not count_match:
            print("ERROR:", repr(page[offset:offset + 300]), file=sys.stderr)
            continue
        count = int(count_match.group(1))
        comic_url = url_overrides.get(title, comic_url)
        try:
            # URLs without "/d/" are checked with "d/" appended
            if "/d/" in comic_url:
                check_robotstxt(comic_url, session)
            else:
                check_robotstxt(comic_url + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for", repr(title))
            continue
        res[title] = (comic_url, count)
コード例 #12
0
ファイル: comicgenesis.py プロジェクト: pataluc/dosage
def handle_url(url, session, res):
    """Process one search result page, adding ``name -> (url, num)`` to ``res``.

    The stored URL is the ``url_overrides`` entry for the name when one
    exists, otherwise the matched URL with a trailing slash appended.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        content, _unused_base = getPageContent(url, session)
    except IOError as exc:
        print("ERROR:", exc, file=sys.stderr)
        return
    for found in url_matcher.finditer(content):
        link = found.group(1) + "/"
        label = unescape(found.group(2))
        label = capfirst(asciify(label.replace("&", "And").replace("@", "At")))
        if label in exclude_comics:
            continue
        if contains_case_insensitive(res, label):
            # two comics differing only in case cannot both be kept
            print("INFO: skipping possible duplicate", repr(label), file=sys.stderr)
            continue
        # look for the image count in the text following the match
        start = found.end()
        number = num_matcher.search(content[start:])
        if not number:
            print("ERROR:", repr(content[start : start + 300]), file=sys.stderr)
            continue
        total = int(number.group(1))
        link = url_overrides.get(label, link)
        try:
            # robots.txt is checked against the "d/" sub-path
            if "/d/" in link:
                check_robotstxt(link, session)
            else:
                check_robotstxt(link + "d/", session)
        except IOError:
            print("INFO: robots.txt denied for", repr(label))
            continue
        res[label] = (link, total)
コード例 #13
0
ファイル: scriptutil.py プロジェクト: serenitas50/dosage
def format_name(text):
    """Return ``text`` formatted as a comic name.

    The text is unescaped, ``&`` and ``@`` are spelled out as ``And`` and
    ``At``, and the result goes through ``asciify`` and ``capfirst``.
    """
    unescaped = unescape(text)
    spelled_out = unescaped.replace(u'&', u'And').replace(u'@', u'At')
    return capfirst(asciify(spelled_out))
コード例 #14
0
ファイル: test_util.py プロジェクト: BigYesh/dosage
 def test_unescape(self):
     # Test HTML replacement: named and numeric entities in the input
     # must come back as the characters they stand for.
     self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
     self.assertEqual(unescape(u'foo&#160;bar'), u'foo\xa0bar')
     self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
コード例 #15
0
ファイル: scriptutil.py プロジェクト: Arwarld/dosage
def format_name(text):
    """Build a comic name from ``text``.

    Steps: ``unescape``, replace ``&`` with ``And`` and ``@`` with ``At``,
    ``asciify``, then ``capfirst``.
    """
    raw = unescape(text)
    raw = raw.replace(u"&", u"And")
    raw = raw.replace(u"@", u"At")
    return capfirst(asciify(raw))
コード例 #16
0
ファイル: test_util.py プロジェクト: johna513/dosage
 def test_unescape(self):
     # Test HTML replacement: entities in the input are decoded; the
     # expected values show the decoded non-breaking space and double
     # quotes in percent-encoded form.
     self.assertEqual(unescape('foo&amp;bar'), 'foo&bar')
     self.assertEqual(unescape('foo&#160;bar'), 'foo%C2%A0bar')
     self.assertEqual(unescape('&quot;foo&quot;'), '%22foo%22')
コード例 #17
0
 def test_unescape(self):
     # Test HTML replacement: the inputs carry real entities so that the
     # assertions exercise the decoding instead of comparing equal text.
     self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
     self.assertEqual(unescape(u'foo&#160;bar'), u'foo\xa0bar')
     self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
コード例 #18
0
ファイル: scriptutil.py プロジェクト: mostlyuseful/dosage
def format_name(text):
    """Return ``text`` formatted as a comic name.

    After unescaping, the text is split on single spaces, ``capfirst`` is
    applied to each piece and the pieces are joined without separator.
    ``&`` and ``@`` are then replaced by ``And`` and ``At`` before the
    result is passed to ``asciify``.
    """
    words = unescape(text).split(" ")
    joined = "".join([capfirst(word) for word in words])
    spelled_out = joined.replace(u'&', u'And').replace(u'@', u'At')
    return asciify(spelled_out)
コード例 #19
0
ファイル: scriptutil.py プロジェクト: KevinAnthony/dosage
def format_name(text):
    """Build a comic name from ``text``.

    The unescaped text is split on single spaces; each part goes through
    ``capfirst`` and the parts are concatenated. Finally ``&`` becomes
    ``And``, ``@`` becomes ``At`` and the result is asciified.
    """
    pieces = []
    for part in unescape(text).split(" "):
        pieces.append(capfirst(part))
    merged = "".join(pieces)
    merged = merged.replace(u'&', u'And')
    merged = merged.replace(u'@', u'At')
    return asciify(merged)
コード例 #20
0
 def test_unescape(self):
     # Test HTML replacement: named and numeric entities in the input
     # must be turned into the characters they represent.
     assert unescape(u'foo&amp;bar') == u'foo&bar'
     assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
     assert unescape(u'&quot;foo&quot;') == u'"foo"'
コード例 #21
0
ファイル: test_util.py プロジェクト: dickloraine/dosage
 def test_unescape(self):
     # Test HTML replacement: the inputs contain real entities, so each
     # assertion checks an actual decoding step.
     assert unescape(u'foo&amp;bar') == u'foo&bar'
     assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
     assert unescape(u'&quot;foo&quot;') == u'"foo"'
コード例 #22
0
ファイル: scriptutil.py プロジェクト: dromaludaire/dosage
def format_name(text):
    """Turn ``text`` into a comic name.

    The text is unescaped, ``&`` is replaced by ``And`` and ``@`` by
    ``At``; the result is asciified and passed through ``capfirst``.
    """
    plain = unescape(text).replace('&', 'And').replace('@', 'At')
    return capfirst(asciify(plain))