def _spider_book_info(url, letter):
    # Scrape a single book page at `url`.
    #
    # Returns the tuple
    #   (url, title, subtitle, author, book_id, code, formats)
    # where `formats` is a sorted list of (format, option value) pairs, or
    # None when the page contains no <h1> element.  `letter` is not used
    # anywhere in the visible body.
    #
    # NOTE(review): the outer `try:` below has no matching except/finally in
    # the visible text -- the handler appears to have been cut off.  Confirm
    # against the complete file before relying on the error behaviour.
    try:
        # handleException=False is forwarded to getHttp as-is; presumably it
        # lets fetch errors propagate to the (missing) handler -- TODO confirm.
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            # No heading: treated as "not a book page".
            return None

        # Redundant after the check above; kept as in the original.
        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")

        # Optional fields stay None when their label is absent from the page.
        subtitle = None
        author = None
        code = None

        # `labels` holds the text of every "title-label" span and `data` the
        # "title-data" spans.  The position of a label is used to index into
        # `data`, so the two lists are assumed to line up one-to-one --
        # TODO confirm against the page markup.
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            # list.index raises ValueError when the label is missing.
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Author")
            # The author name is taken from the first <a> inside the data span.
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Language")
            # NOTE(review): the trailing "%" in the href pattern is presumably
            # a wildcard understood by this BeautifulSoup version -- confirm.
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            # 19 == len("/language.php?code="), so the slice starts right after
            # the prefix and stops at the next "&".
            # NOTE(review): str.find returns -1 when there is no "&", which
            # would make the slice drop the last character of the code.
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass

        # The book id is read from the hidden <input name="tid"> element.
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")

        # Python 2 print statement: writes `author: "title"` encoded as
        # iso-8859-1, silently dropping characters that cannot be encoded.
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")

        # Download formats come from the <option>s of <select name="book">.
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                # The first whitespace-separated word of the option text is
                # the format name; formats not in ebooks.FORMATS are skipped.
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue

                val = opt["value"]
                formats.append((format, val))

            # Any failure on one option is logged and the loop carries on
            # with the next option (Python 2 `except X, name` syntax).
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
Example #2
0
def retrieveSwitchboardBusiness(name, cityOrZip, state, surrounding, categoryOrName):
    """Run a Switchboard business search and parse the returned page.

    The URL template is chosen from ``categoryOrName`` ("Name" or
    "Category") and from whether ``cityOrZip`` consists of exactly five
    digits.  For any other ``categoryOrName`` the URL stays an empty string
    and is still handed to ``getHttp``.  ``surrounding`` is only used in the
    parsing-failure log entry.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` yields ``None``,
    otherwise the ``(res, data)`` pair from ``parseSwitchboardBusiness``.
    """
    def fill(template):
        # Every template takes the same three URL-quoted fields, in this order.
        return template % (urllib.quote(name), urllib.quote(cityOrZip), urllib.quote(state))

    # Five characters, all digits: treat the location as a ZIP code.
    is_zip = cityOrZip.isdigit() and len(cityOrZip) == 5

    url = ""
    if categoryOrName == "Name":
        if is_zip:
            url = fill(switchboardServerUrlBusinessSearchZip)
        else:
            url = fill(switchboardServerUrlBusinessSearch)
    elif categoryOrName == "Category":
        if is_zip:
            url = fill(switchboardServerUrlBusinessSearchCategoryZip)
        else:
            url = fill(switchboardServerUrlBusinessSearchCategory)

    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)

    res, data = parseSwitchboardBusiness(page)
    if res == UNKNOWN_FORMAT:
        # Record the full query so the unparsable page can be reproduced.
        query = ",".join([name, cityOrZip, state, surrounding, categoryOrName])
        logParsingFailure("411-Business-Search", query, page, url)
    return res, data
def _retrieve_pacific():
    """Download the page at ``_g_pacific_url`` and parse it as currency data.

    The fetch passes ``retryCount=3`` to ``getHttp``.  Returns
    ``(RETRIEVE_FAILED, None)`` when ``getHttp`` gives back ``None``;
    otherwise returns whatever ``_parse_currency`` produces for the page
    using ``currency_pacific.parseCurrencyData``.
    """
    source_url = _g_pacific_url
    page = getHttp(source_url, retryCount=3)
    if page is not None:
        return _parse_currency(page, currency_pacific.parseCurrencyData, source_url)
    return (RETRIEVE_FAILED, None)
def _retrieve_yp_zipCodeByCity(city, state):
    """Query the yp.whitepages.com ``Find_Zip`` page for ``city``/``state``.

    Both values are URL-quoted before being placed in the query string.
    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``m411.ZIPCodeByCity``.  A
    result of ``UNKNOWN_FORMAT`` is additionally reported through
    ``logParsingFailure``.
    """
    url = "http://yp.whitepages.com/search/Find_Zip?city_zip=%s&state_id=%s" % (
        urllib.quote(city), urllib.quote(state))

    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = m411.ZIPCodeByCity(page)
    if status == UNKNOWN_FORMAT:
        logParsingFailure("411-Zip-By-City", ",".join((city, state)), page, url)
    return status, payload
def _retrieve_411_international(code):
    """Query the 411.com ``Find_Intl_Code`` page for ``code``.

    ``code`` is interpolated into the ``country_id`` query parameter as-is
    (it is not URL-quoted here).  Returns ``(RETRIEVE_FAILED, None)`` when
    ``getHttp`` returns ``None``, otherwise the ``(res, data)`` pair from
    ``m411_by411.internationalCodeSearch``; an ``UNKNOWN_FORMAT`` result is
    also reported through ``logParsingFailure``.
    """
    url = "http://www.411.com/search/Find_Intl_Code?country_id=%s" % code

    page = getHttp(url)
    if page is not None:
        status, payload = m411_by411.internationalCodeSearch(page)
        if status == UNKNOWN_FORMAT:
            logParsingFailure("411-International-Code", code, page, url)
        return status, payload
    return (RETRIEVE_FAILED, None)
def _retrieve_yp_reversePhone(xxx, yyy, zzzz):
    """Fetch and parse a reverse phone lookup from ``wy_com_address``.

    The three number parts fill the ``npa``/``np3``/``np4`` parameters; the
    trailing ``phone`` parameter is built from ``yyy`` and ``zzzz`` only.
    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``m411.reversePhoneLookup``; an
    ``UNKNOWN_FORMAT`` result is also reported through
    ``logParsingFailure``.
    """
    # The format operator is applied to the whole concatenated string, so
    # the server prefix takes part in the substitution as well.
    template = wy_com_address + "/wp-p-results.php?npa=%s&np3=%s&np4=%s&client=1482&ver=1.2&type=p&phone=%s%s"
    url = template % (xxx, yyy, zzzz, yyy, zzzz)

    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = m411.reversePhoneLookup(page)
    if status == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Phone", "-".join((xxx, yyy, zzzz)), page, url)
    return status, payload
def _retrieve_yp_person(firstName, lastName, cityOrZip, state):
    """Fetch and parse a white-pages person search from ``wy_com_address``.

    All four arguments are URL-quoted before being placed in the query
    string.  Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns
    ``None``, otherwise the ``(res, data)`` pair from
    ``m411.personSearch``; an ``UNKNOWN_FORMAT`` result is also reported
    through ``logParsingFailure``.
    """
    fields = (firstName, lastName, cityOrZip, state)

    # The format operator is applied to the whole concatenated string, so
    # the server prefix takes part in the substitution as well.
    template = wy_com_address + "/white-pages-results.php?f=%s&firstname_begins_with=1&l=%s&name_begins_with=1&c=%s&s=%s&client=&ver=1.4&type=r"
    url = template % tuple([urllib.quote(field) for field in fields])

    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = m411.personSearch(page)
    if status == UNKNOWN_FORMAT:
        logParsingFailure("411-Person-Search", ";".join(fields), page, url)
    return status, payload
def _retrieve_411_areaCodeByCity(city, state):
    """Query the 411.com ``Find_Areacode`` page for ``city``/``state``.

    Both values are URL-quoted before being placed in the query string.
    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``m411_by411.areaCodeByCity``;
    an ``UNKNOWN_FORMAT`` result is also reported through
    ``logParsingFailure``.
    """
    query = (urllib.quote(city), urllib.quote(state))
    url = "http://www.411.com/search/Find_Areacode?city=%s&state_id=%s" % query

    page = getHttp(url)
    if page is not None:
        status, payload = m411_by411.areaCodeByCity(page)
        if status == UNKNOWN_FORMAT:
            logParsingFailure("411-Area-Code-By-City", ",".join((city, state)), page, url)
        return status, payload
    return (RETRIEVE_FAILED, None)
def _retrieve_411_reverseZipCode(code):
    """Query the 411.com ``Reverse_Zip`` page for ``code``.

    ``code`` is interpolated into the ``zip`` query parameter as-is (it is
    not URL-quoted here).  Returns ``(RETRIEVE_FAILED, None)`` when
    ``getHttp`` returns ``None``, otherwise the ``(res, data)`` pair from
    ``m411_by411.reverseZIPCodeLookup``; an ``UNKNOWN_FORMAT`` result is
    also reported through ``logParsingFailure``.
    """
    url = "http://www.411.com/search/Reverse_Zip?zip=%s" % code

    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = m411_by411.reverseZIPCodeLookup(page)
    if status == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Zip", code, page, url)
    return status, payload
Example #10
0
def _retrieve_411_reverseAreaCode(code):
    """Query the 411.com ``Reverse_Areacode`` page for ``code``.

    ``code`` is interpolated into the ``npa`` query parameter as-is (it is
    not URL-quoted here).  Returns ``(RETRIEVE_FAILED, None)`` when
    ``getHttp`` returns ``None``, otherwise the ``(res, data)`` pair from
    ``m411_by411.reverseAreaCodeLookup``; an ``UNKNOWN_FORMAT`` result is
    also reported through ``logParsingFailure``.
    """
    url = "http://www.411.com/log_feature/sort/search/Reverse_Areacode?npa=%s&sort=alpha" % code

    page = getHttp(url)
    if page is not None:
        status, payload = m411_by411.reverseAreaCodeLookup(page)
        if status == UNKNOWN_FORMAT:
            logParsingFailure("411-Reverse-Area-Code", code, page, url)
        return status, payload
    return (RETRIEVE_FAILED, None)
Example #11
0
def _retrieve_whitepages_reversePhone(xxx, yyy, zzzz):
    """Query the yp.whitepages.com ``Reverse_Phone`` page.

    The three number parts are concatenated without separators into the
    ``phone`` query parameter.  Returns ``(RETRIEVE_FAILED, None)`` when
    ``getHttp`` returns ``None``, otherwise the ``(res, data)`` pair from
    ``m411.reversePhoneLookupWhitepages``; an ``UNKNOWN_FORMAT`` result is
    also reported through ``logParsingFailure``.
    """
    parts = (xxx, yyy, zzzz)
    url = "http://yp.whitepages.com/1048/search/Reverse_Phone?phone=%s%s%s" % parts

    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = m411.reversePhoneLookupWhitepages(page)
    if status == UNKNOWN_FORMAT:
        # The log entry uses the dashed form of the number.
        logParsingFailure("411-Reverse-Phone", "-".join(parts), page, url)
    return status, payload
Example #12
0
def _retrieve_411_person(firstName, lastName, cityOrZip, state):
    """Query the 411.com ``Find_Person`` page and parse the result.

    All four arguments are URL-quoted before being placed in the query
    string.  Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns
    ``None``, otherwise the ``(res, data)`` pair from
    ``m411_by411.personSearch``; an ``UNKNOWN_FORMAT`` result is also
    reported through ``logParsingFailure``.
    """
    fields = (firstName, lastName, cityOrZip, state)
    template = "http://www.411.com/search/Find_Person?firstname_begins_with=1&firstname=%s&name_begins_with=1&name=%s&city_zip=%s&state_id=%s"
    url = template % tuple([urllib.quote(field) for field in fields])

    page = getHttp(url)
    if page is not None:
        status, payload = m411_by411.personSearch(page)
        if status == UNKNOWN_FORMAT:
            logParsingFailure("411-Person-Search", ";".join(fields), page, url)
        return status, payload
    return (RETRIEVE_FAILED, None)
Example #13
0
def retrieve411ReversePhone(xxx, yyy, zzzz):
    """Query the 411.com ``Reverse_Phone`` page for ``xxx-yyy-zzzz``.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``parse411ReversePhoneLookup``;
    an ``UNKNOWN_FORMAT`` result is also reported through
    ``logParsingFailure``.
    """
    parts = (xxx, yyy, zzzz)
    url = "http://www.411.com/search/Reverse_Phone?phone=%s-%s-%s" % parts

    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)

    status, payload = parse411ReversePhoneLookup(page)
    if status == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Phone", "-".join(parts), page, url)
    return status, payload
Example #14
0
def retrieve411ReversePhone(xxx, yyy, zzzz):
    """Look up a phone number on the 411.com ``Reverse_Phone`` page.

    The number is sent as ``xxx-yyy-zzzz``.  When ``getHttp`` returns
    ``None`` the result is ``(RETRIEVE_FAILED, None)``; otherwise it is the
    ``(res, data)`` pair from ``parse411ReversePhoneLookup``, and a result
    of ``UNKNOWN_FORMAT`` is also reported through ``logParsingFailure``.
    """
    url = "http://www.411.com/search/Reverse_Phone?phone=%s-%s-%s" % (xxx, yyy, zzzz)

    response = getHttp(url)
    if response is not None:
        res, data = parse411ReversePhoneLookup(response)
        if res == UNKNOWN_FORMAT:
            dashed = "-".join([xxx, yyy, zzzz])
            logParsingFailure("411-Reverse-Phone", dashed, response, url)
        return res, data
    return (RETRIEVE_FAILED, None)
Example #15
0
def _retrieve_exchangerate():
    """Fetch every URL in ``_g_exchangerate_urls`` and merge the results.

    Each page is parsed by ``_parse_currency`` with
    ``currency_exchangerate.parseCurrencyData``.  The first page that
    cannot be fetched ends the run with ``(RETRIEVE_FAILED, None)``; the
    first page whose parse result is not ``RESULTS_DATA`` ends it with that
    ``(res, data)`` pair.  When every page succeeds the return value is
    ``(RESULTS_DATA, merged)`` where ``merged`` is one dict updated with
    each page's data in turn.
    """
    merged = {}
    for source_url in _g_exchangerate_urls:
        page = getHttp(source_url, retryCount=3)
        if page is None:
            return (RETRIEVE_FAILED, None)

        res, data = _parse_currency(page, currency_exchangerate.parseCurrencyData, source_url)
        if RESULTS_DATA != res:
            return res, data

        # Later pages overwrite entries with the same key.
        merged.update(data)
    return (RESULTS_DATA, merged)
Example #16
0
def retrieveYpReverseAreaCode(zipCode):
    """Query the yp.whitepages.com ``Reverse_Areacode`` page.

    ``zipCode`` is interpolated as-is into the ``npa`` query parameter.
    NOTE(review): despite the parameter name, the URL suggests an area code
    is expected here -- confirm with callers.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``parseYpReverseAreaCode``; an
    ``UNKNOWN_FORMAT`` result is also reported through
    ``logParsingFailure``.
    """
    url = "http://yp.whitepages.com/log_feature/sort/search/Reverse_Areacode?npa=%s&sort=alpha" % zipCode
    # Trace output kept from the original.  The parenthesised single-argument
    # form prints the same text under the Python 2 print statement and is
    # also valid syntax on Python 3.
    print("retrieveYpReverseAreaCode")
    htmlText = getHttp(url, retryCount=3)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = parseYpReverseAreaCode(htmlText)
    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Area-Code", zipCode, htmlText, url)
    return res, data
Example #17
0
def retrieveBusinessSearchByUrl(urlIn):
    """Fetch a business-search result page identified by ``urlIn`` and parse it.

    The prefix of ``urlIn`` decides both the absolute URL and the parser:

    * ``yplist.php...`` -> ``wy_com_address`` + "/" + urlIn,
      parsed by ``m411.businessSearch``
    * ``/servlet...`` -> ``http://www.dexonline.com`` + urlIn,
      parsed by ``m411_by411.businessSearchDex``
    * ``http://www.switchboard.com...`` -> used unchanged,
      parsed by ``m411_by411.parseSwitchboardBusiness``

    Any other ``urlIn`` is not fetched at all.  Returns
    ``(RETRIEVE_FAILED, None)`` for an unrecognised URL or when ``getHttp``
    returns ``None``; otherwise the parser's ``(res, data)`` pair, with an
    ``UNKNOWN_FORMAT`` result also reported through ``logParsingFailure``.
    """
    # Which server does this URL belong to?
    if urlIn.startswith("yplist.php"):
        url = wy_com_address + "/%s" % urlIn
        backend = "yp"
    elif urlIn.startswith("/servlet"):
        url = "http://www.dexonline.com%s" % urlIn
        backend = "dex"
    elif urlIn.startswith("http://www.switchboard.com"):
        url = urlIn
        backend = "switch"
    else:
        # Unrecognised prefix: no request is made.
        return (RETRIEVE_FAILED, None)

    # All three backends are fetched the same way.
    htmlText = getHttp(url, retryCount=3)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)

    # Hand the page to the parser that matches the backend.
    if backend == "yp":
        res, data = m411.businessSearch(htmlText)
    elif backend == "dex":
        res, data = m411_by411.businessSearchDex(htmlText)
    else:
        res, data = m411_by411.parseSwitchboardBusiness(htmlText)

    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Business-Search-By-Url", urlIn, htmlText, url)
    return res, data
def _retrieve_exchangerate():
    """Collect currency data from all pages listed in ``_g_exchangerate_urls``.

    Pages are fetched one at a time with ``getHttp(..., retryCount=3)`` and
    parsed through ``_parse_currency`` using
    ``currency_exchangerate.parseCurrencyData``.  Processing stops early
    with ``(RETRIEVE_FAILED, None)`` if a page cannot be fetched, or with
    the parser's own ``(res, data)`` if ``res`` is not ``RESULTS_DATA``.
    Otherwise the per-page dicts are combined and returned as
    ``(RESULTS_DATA, combined)``.
    """
    parser = None
    combined = dict()
    for page_url in _g_exchangerate_urls:
        body = getHttp(page_url, retryCount=3)
        if body is None:
            return (RETRIEVE_FAILED, None)

        parser = currency_exchangerate.parseCurrencyData
        res, data = _parse_currency(body, parser, page_url)
        if RESULTS_DATA != res:
            # Pass the parser's failure straight through to the caller.
            return res, data
        combined.update(data)
    return (RESULTS_DATA, combined)
Example #19
0
def retrieveSwitchboardBusiness(name, cityOrZip, state, surrounding, categoryOrName):
    """Search Switchboard for a business and parse the result page.

    ``categoryOrName`` selects a name search ("Name") or a category search
    ("Category"); a ``cityOrZip`` made of exactly five digits selects the
    ZIP variant of the URL template.  With any other ``categoryOrName`` the
    URL remains an empty string, which is still passed to ``getHttp``.
    ``surrounding`` is only written to the parsing-failure log.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``parseSwitchboardBusiness``.
    """
    by_zip = cityOrZip.isdigit() and len(cityOrZip) == 5
    by_name = categoryOrName == "Name"
    by_category = categoryOrName == "Category"

    def quoted_fields():
        # Built on demand so nothing is quoted when no template applies.
        return (urllib.quote(name), urllib.quote(cityOrZip), urllib.quote(state))

    url = ""
    if by_zip:
        if by_name:
            url = switchboardServerUrlBusinessSearchZip % quoted_fields()
        elif by_category:
            url = switchboardServerUrlBusinessSearchCategoryZip % quoted_fields()
    else:
        if by_name:
            url = switchboardServerUrlBusinessSearch % quoted_fields()
        elif by_category:
            url = switchboardServerUrlBusinessSearchCategory % quoted_fields()

    response = getHttp(url)
    if response is None:
        return (RETRIEVE_FAILED, None)

    res, data = parseSwitchboardBusiness(response)
    if res == UNKNOWN_FORMAT:
        searched_for = ",".join((name, cityOrZip, state, surrounding, categoryOrName))
        logParsingFailure("411-Business-Search", searched_for, response, url)
    return res, data
Example #20
0
def _retrieve_yp_business(name, cityOrZip, state, surrounding, categoryOrName):
    """Run a yellow-pages business search and parse the result page.

    Spaces in ``name`` are replaced by "+" first; that modified name is
    what gets URL-quoted and what appears in the parsing-failure log.
    ``categoryOrName`` ("Name" or "Category") and ``surrounding`` ("Yes" or
    anything else) together select one of four URL templates.  With any
    other ``categoryOrName`` the URL remains an empty string, which is
    still passed to ``getHttp``.

    NOTE(review): ``urllib.quote`` is applied after the "+" substitution,
    so the "+" itself is presumably percent-encoded -- confirm this is what
    the server expects.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from ``m411.businessSearch``.
    """
    search_name = name.replace(" ", "+")
    include_surrounding = surrounding == "Yes"

    def fill(template):
        # Every template takes the same three URL-quoted fields, in this order.
        return template % (urllib.quote(search_name), urllib.quote(cityOrZip), urllib.quote(state))

    url = ""
    if categoryOrName == "Name":
        if include_surrounding:
            url = fill(ypServerUrlBusinessSearchYPsa)
        else:
            url = fill(ypServerUrlBusinessSearch)
    elif categoryOrName == "Category":
        if include_surrounding:
            url = fill(ypServerUrlBusinessSearchCategoryYPsa)
        else:
            url = fill(ypServerUrlBusinessSearchCategory)

    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)

    res, data = m411.businessSearch(page)
    if res == UNKNOWN_FORMAT:
        query = ",".join([search_name, cityOrZip, state, surrounding, categoryOrName])
        logParsingFailure("411-Business-Search", query, page, url)
    return res, data
Example #21
0
def _retrieve_dex_business(name, cityOrZip, state, surrounding, categoryOrName):
    """Run a dexonline.com business search and parse the result page.

    A ``cityOrZip`` made of exactly five digits is rejected up front: the
    refusal is logged with ``SEV_EXC`` and ``(RETRIEVE_FAILED, None)`` is
    returned without any request.  ``surrounding == "Yes"`` is sent as
    "true", anything else as "false".  ``categoryOrName`` ("Name" or
    "Category") selects the URL template; with any other value the URL
    remains an empty string, which is still passed to ``getHttp``.

    Returns ``(RETRIEVE_FAILED, None)`` when ``getHttp`` returns ``None``,
    otherwise the ``(res, data)`` pair from
    ``m411_by411.businessSearchDex``.
    """
    # This source does not accept ZIP codes.
    looks_like_zip = cityOrZip.isdigit() and len(cityOrZip) == 5
    if looks_like_zip:
        log(SEV_EXC, "_retrieve_dex_business doesn't support cityOrZip='%s'" % cityOrZip)
        return RETRIEVE_FAILED, None

    if surrounding == "Yes":
        sur_flag = "true"
    else:
        sur_flag = "false"

    # The two templates take their fields in a different order.
    url = ""
    if categoryOrName == "Name":
        url = dexServerUrlBusinessSearch % (
            urllib.quote(cityOrZip), urllib.quote(state), sur_flag, urllib.quote(name))
    elif categoryOrName == "Category":
        url = dexServerUrlBusinessSearchCategory % (
            sur_flag, urllib.quote(name), urllib.quote(cityOrZip), urllib.quote(state))

    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)

    res, data = m411_by411.businessSearchDex(page)
    if res == UNKNOWN_FORMAT:
        query = ",".join([name, cityOrZip, state, surrounding, categoryOrName])
        logParsingFailure("411-Business-Search", query, page, url)
    return res, data
Example #22
0
def _retrieve_moneyextra():
    """Download the page at ``_g_moneyextra_url`` and parse it as currency data.

    The fetch passes ``retryCount=3`` to ``getHttp``.  Returns
    ``(RETRIEVE_FAILED, None)`` when ``getHttp`` gives back ``None``;
    otherwise returns whatever ``_parse_currency`` produces for the page
    using ``currency_moneyextra.parseCurrencyData``.
    """
    source_url = _g_moneyextra_url
    page = getHttp(source_url, retryCount=3)
    if page is not None:
        return _parse_currency(page, currency_moneyextra.parseCurrencyData, source_url)
    return (RETRIEVE_FAILED, None)
 def _spider_letter_page(self, letter, page, index):
     """Fetch one titles-listing page and hand it to ``_parse_letter_page``.

     The URL is ``_g_manybooks_titles_url`` filled with ``(letter, page)``
     and encoded as iso-8859-1 before the request.  ``getHttp`` is called
     with ``handleException=False``.  Returns whatever
     ``self._parse_letter_page(letter, html, index)`` returns.
     """
     page_url = (_g_manybooks_titles_url % (letter, page)).encode("iso-8859-1")
     markup = getHttp(page_url, handleException = False)
     return self._parse_letter_page(letter, markup, index)