def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)

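# All variants below rely on contains_case_insensitive() to skip comic names
# that differ only in case. A minimal sketch of such a helper (an assumption,
# not necessarily the project's actual implementation):
def contains_case_insensitive(adict, akey):
    """Check if akey is already a key of adict, ignoring case (illustrative sketch)."""
    lowered = akey.lower()
    return any(key.lower() == lowered for key in adict)
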
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = comicurl

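# Some variants call format_name() while others inline the same steps
# (unescape, '&' -> 'And', '@' -> 'At', asciify, capfirst). A hedged sketch of
# a helper combining those steps, assuming the same unescape/asciify/capfirst
# helpers used above:
def format_name(text):
    """Format a comic name (sketch reconstructed from the inline variants)."""
    name = unescape(text)
    name = asciify(name.replace('&', 'And').replace('@', 'At'))
    name = capfirst(name)
    return name
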
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        mo = descurl_matcher.search(match.group(1))
        desc = get_description(url + mo.group(1), session)
        comicurl = match.group(2)
        name = unescape(match.group(3))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        try:
            if "/d/" not in comicurl:
                check_robotstxt(comicurl + "d/", session)
            else:
                check_robotstxt(comicurl, session)
        except IOError:
            print("INFO: robots.txt denied for keenspot", repr(name))
            continue
        res[name] = (comicurl, desc)

def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = unquote(unescape(match.group(1)))
        path = comicurl[:-1].rsplit('/')[-1]
        name = capfirst(asciify(path))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (path, num)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = format_name(comicurl.split('.', 1)[0][7:])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find activity
        mo = activity_matcher.search(data[end:])
        if not mo:
            print("ERROR matching activity:", repr(data[end:end+300]), file=sys.stderr)
            continue
        active = mo.group(1).lower() == "active"
        res[name] = [comicurl, num, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        # "html" here is expected to be lxml.html (document_fromstring / cssselect)
        data = html.document_fromstring(getPageContent(url, session))
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    num = 999
    for comicdiv in data.cssselect('div.searchresult'):
        comiclink = comicdiv.cssselect('h3 a')[0]
        comicurl = comiclink.attrib['href']
        name = format_name(comiclink.text)
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        info = comicdiv.cssselect('span.comicinfo')
        # find out how many images this comic has
        num = int(info[1].text.strip())
        # find activity
        active = info[6].text.strip().lower() == "active"
        lang = info[7].text.strip().lower()
        res[name] = [comicurl, num, active, lang]
    return num

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for comicgenesis", repr(name))
            continue
        else:
            res[name] = (url, num)

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (url_overrides.get(name, url), num)

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = format_name(comicurl.split('.', 1)[0][7:])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find description
        end = match.end()
        mo = desc_matcher.search(data[end:])
        if not mo:
            print("ERROR matching description:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # find out how many images this comic has
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find genre
        mo = genre_matcher.search(data[end:])
        if not mo:
            print("ERROR matching genre:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        genre = mo.group(1)
        # find activity
        mo = activity_matcher.search(data[end:])
        if not mo:
            print("ERROR matching activity:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        active = mo.group(1).lower() == "active"
        res[name] = [comicurl, desc, num, genre, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = remove_html_tags(mo.group(1))
        desc = unescape(desc)
        desc = unquote(desc)
        desc = compact_whitespace(desc).strip()
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]

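# The variant above inlines the description cleanup that other variants wrap
# in format_description(). A sketch of such a helper, assuming the same
# remove_html_tags/unescape/unquote/compact_whitespace helpers used above:
def format_description(text):
    """Clean up a scraped comic description (sketch based on the code above)."""
    desc = remove_html_tags(text)
    desc = unescape(desc)
    desc = unquote(desc)
    desc = compact_whitespace(desc).strip()
    return desc
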
def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for adult flag
        # note: end is the offset from the listing-page match, reused here on data2
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, bool(adult), bounce
        ]

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        res[name] = shortname

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        url = url_overrides.get(name, url)
        try:
            if "/d/" not in url:
                check_robotstxt(url + "d/", session)
            else:
                check_robotstxt(url, session)
        except IOError:
            print("INFO: robots.txt denied for", repr(name))
            continue
        else:
            res[name] = (url, num)

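# For context, each handle_url() variant is driven by a loop over search
# result pages that accumulates entries into a shared dict. A hypothetical
# driver for the (url, session, res) signature; the page URL pattern and page
# count below are placeholders, not taken from the original scripts:
import requests  # assumed: the variants above pass a requests-style session


def get_results():
    """Collect comic data from all search result pages (illustrative sketch)."""
    res = {}
    session = requests.Session()
    base = "https://example.com/search?page=%d"  # placeholder URL
    for page in range(1, 10):                    # placeholder page range
        handle_url(base % page, session, res)
    return res
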