# Scrape a single manybooks book page: pull the title, optional subtitle and
# author, the language code (from the /language.php?code=... href), the hidden
# "tid" form field (book id) and the list of (format, value) pairs offered in
# the "book" <select>, keeping only formats listed in ebooks.FORMATS.
# Returns (url, title, subtitle, author, book_id, code, formats) or None when
# the page has no <h1>.  All extracted text is decoded as iso-8859-1.
# NOTE(review): as transcribed onto this single line the leading 'try:' has no
# visible matching 'except' clause (five try, four except) - presumably the
# outer handler was lost when the file was collapsed; confirm against the
# original source before restructuring this function.
def _spider_book_info(url, letter): try: html = getHttp(url, handleException = False) soup = BeautifulSoup() soup.feed(html) h1 = soup.first("h1") if h1 is None: return None assert h1 is not None title = retrieveContents(h1).decode("iso-8859-1") subtitle = None author = None code = None labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})] data = soup.fetch("span", {"class": "title-data"}) try: index = labels.index("Subtitle") subtitle = retrieveContents(data[index]).decode("iso-8859-1") except ValueError: pass try: index = labels.index("Author") author = retrieveContents(data[index].first("a")).decode("iso-8859-1") except ValueError: pass try: index = labels.index("Language") href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"]) code = href[19:href.find("&", 19)].decode("iso-8859-1") except ValueError: pass tid = soup.first("input", {"type": "hidden", "name": "tid"}) assert tid is not None book_id = tid["value"].decode("iso-8859-1") print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore") sel = soup.first("select", {"name": "book"}) assert sel is not None opts = sel.fetch("option") formats = [] for opt in opts: try: format = retrieveContents(opt).split()[0] if format not in ebooks.FORMATS: continue val = opt["value"] formats.append((format, val)) except Exception, ex: log(SEV_EXC, exceptionAsStr(ex)) formats.sort() return (url, title, subtitle, author, book_id, code, formats)
def retrieveSwitchboardBusiness(name,cityOrZip,state,surrounding,categoryOrName):
    """Search switchboard.com for a business.

    name           -- business name (or category text, per categoryOrName)
    cityOrZip      -- city name or a 5-digit ZIP code
    state          -- state id
    surrounding    -- "Yes"/"No"; only used here when logging a parse failure
    categoryOrName -- "Name" or "Category"; selects the server url template

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    NOTE(review): this file contains a second definition of
    retrieveSwitchboardBusiness - the later one wins at import time.
    """
    url = ""
    # renamed from 'zip' so we don't shadow the builtin
    isZip = cityOrZip.isdigit() and len(cityOrZip) == 5
    if categoryOrName == "Name":
        if isZip:
            url = switchboardServerUrlBusinessSearchZip % (urllib.quote(name),urllib.quote(cityOrZip),urllib.quote(state))
        else:
            url = switchboardServerUrlBusinessSearch % (urllib.quote(name),urllib.quote(cityOrZip),urllib.quote(state))
    if categoryOrName == "Category":
        if isZip:
            url = switchboardServerUrlBusinessSearchCategoryZip % (urllib.quote(name),urllib.quote(cityOrZip),urllib.quote(state))
        else:
            url = switchboardServerUrlBusinessSearchCategory % (urllib.quote(name),urllib.quote(cityOrZip),urllib.quote(state))
    # using cached for testing
    #htmlText = getHttpCached(url)
    htmlText = getHttp(url)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = parseSwitchboardBusiness(htmlText)
    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Business-Search", name+","+cityOrZip+","+state+","+surrounding+","+categoryOrName, htmlText, url)
    return res, data
def _retrieve_pacific():
    """Fetch the Pacific currency page and hand it to the currency parser.

    Returns (RETRIEVE_FAILED, None) when the page cannot be fetched,
    otherwise whatever _parse_currency() returns.
    """
    global _g_pacific_url
    page = getHttp(_g_pacific_url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    return _parse_currency(page, currency_pacific.parseCurrencyData, _g_pacific_url)
def _retrieve_yp_zipCodeByCity(city, state):
    """Look up the ZIP code(s) for a city/state pair on yp.whitepages.com.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    Logs a parsing failure when the page format is unrecognized.
    """
    url = "http://yp.whitepages.com/search/Find_Zip?city_zip=%s&state_id=%s" % (
        urllib.quote(city), urllib.quote(state))
    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411.ZIPCodeByCity(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Zip-By-City", city+","+state, page, url)
    return res, data
def _retrieve_411_international(code):
    """Look up an international dialing code by country id on www.411.com.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://www.411.com/search/Find_Intl_Code?country_id=%s" % code
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.internationalCodeSearch(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-International-Code", code, page, url)
    return res, data
def _retrieve_yp_reversePhone(xxx,yyy,zzzz):
    """Reverse phone-number lookup; xxx-yyy-zzzz are the number's three parts.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    # note: the template repeats yyy/zzzz - once as np3/np4 and once in phone=
    url = (wy_com_address+"/wp-p-results.php?npa=%s&np3=%s&np4=%s&client=1482&ver=1.2&type=p&phone=%s%s"
           % (xxx,yyy,zzzz,yyy,zzzz))
    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411.reversePhoneLookup(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Reverse-Phone", xxx+"-"+yyy+"-"+zzzz, page, url)
    return res, data
def _retrieve_yp_person(firstName,lastName,cityOrZip,state):
    """White-pages person search; all arguments are url-quoted into the query.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    args = (urllib.quote(firstName), urllib.quote(lastName),
            urllib.quote(cityOrZip), urllib.quote(state))
    url = (wy_com_address+"/white-pages-results.php?f=%s&firstname_begins_with=1&l=%s&name_begins_with=1&c=%s&s=%s&client=&ver=1.4&type=r"
           % args)
    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411.personSearch(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Person-Search", firstName+";"+lastName+";"+cityOrZip+";"+state, page, url)
    return res, data
def _retrieve_411_areaCodeByCity(city, state):
    """Find the area code for a city/state pair on www.411.com.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://www.411.com/search/Find_Areacode?city=%s&state_id=%s" % (
        urllib.quote(city), urllib.quote(state))
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.areaCodeByCity(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Area-Code-By-City", city+","+state, page, url)
    return res, data
def _retrieve_411_reverseZipCode(code):
    """Reverse ZIP-code lookup on www.411.com.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://www.411.com/search/Reverse_Zip?zip=%s" % code
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.reverseZIPCodeLookup(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Reverse-Zip", code, page, url)
    return res, data
def _retrieve_411_reverseAreaCode(code):
    """Reverse area-code lookup on www.411.com (alphabetically sorted results).

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://www.411.com/log_feature/sort/search/Reverse_Areacode?npa=%s&sort=alpha" % code
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.reverseAreaCodeLookup(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Reverse-Area-Code", code, page, url)
    return res, data
def _retrieve_whitepages_reversePhone(xxx,yyy,zzzz):
    """Reverse phone lookup on yp.whitepages.com; xxx-yyy-zzzz is the number.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://yp.whitepages.com/1048/search/Reverse_Phone?phone=%s%s%s" % (xxx,yyy,zzzz)
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411.reversePhoneLookupWhitepages(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Reverse-Phone", xxx+"-"+yyy+"-"+zzzz, page, url)
    return res, data
def _retrieve_411_person(firstName,lastName,cityOrZip,state):
    """Person search on www.411.com; all arguments are url-quoted.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    args = (urllib.quote(firstName), urllib.quote(lastName),
            urllib.quote(cityOrZip), urllib.quote(state))
    url = ("http://www.411.com/search/Find_Person?firstname_begins_with=1&firstname=%s&name_begins_with=1&name=%s&city_zip=%s&state_id=%s"
           % args)
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.personSearch(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Person-Search", firstName+";"+lastName+";"+cityOrZip+";"+state, page, url)
    return res, data
def retrieve411ReversePhone(xxx, yyy, zzzz):
    """Reverse phone lookup on www.411.com; xxx-yyy-zzzz is the phone number.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    NOTE(review): this file contains a second, identical definition of
    retrieve411ReversePhone - the later one wins at import time.
    """
    url = "http://www.411.com/search/Reverse_Phone?phone=%s-%s-%s" % (xxx, yyy, zzzz)
    # using cached for testing
    # htmlText = getHttpCached(url)
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = parse411ReversePhoneLookup(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Reverse-Phone", xxx + "-" + yyy + "-" + zzzz, page, url)
    return res, data
def retrieve411ReversePhone(xxx, yyy, zzzz):
    """Reverse phone lookup on www.411.com for the number xxx-yyy-zzzz.

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    NOTE(review): an earlier, identical definition of this function also
    exists in this file; this later one is the effective definition.
    """
    phone = "%s-%s-%s" % (xxx,yyy,zzzz)
    url = "http://www.411.com/search/Reverse_Phone?phone=" + phone
    # using cached for testing
    #htmlText = getHttpCached(url)
    htmlText = getHttp(url)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = parse411ReversePhoneLookup(htmlText)
    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Phone", phone, htmlText, url)
    return res, data
def _retrieve_exchangerate():
    """Fetch every exchangerate url and merge the parsed currencies.

    Bails out with the failing status as soon as any url cannot be fetched
    or parsed; otherwise returns (RESULTS_DATA, merged-dict).
    """
    global _g_exchangerate_urls
    merged = {}
    for url in _g_exchangerate_urls:
        page = getHttp(url, retryCount=3)
        if page is None:
            return (RETRIEVE_FAILED, None)
        res, data = _parse_currency(page, currency_exchangerate.parseCurrencyData, url)
        if res != RESULTS_DATA:
            return res, data
        merged.update(data)
    return (RESULTS_DATA, merged)
def retrieveYpReverseAreaCode(zipCode):
    """Reverse area-code lookup on yp.whitepages.com.

    zipCode -- the area code (npa query parameter) to look up.
    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    url = "http://yp.whitepages.com/log_feature/sort/search/Reverse_Areacode?npa=%s&sort=alpha" % zipCode
    # removed a leftover debug print of the function name
    #use cached for testing
    #htmlText = getHttpCached(url, retryCount=3)
    htmlText = getHttp(url, retryCount=3)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = parseYpReverseAreaCode(htmlText)
    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Reverse-Area-Code", zipCode, htmlText, url)
    return res, data
def retrieveBusinessSearchByUrl(urlIn):
    """Retrieve and parse a business-search results page given its url.

    The url prefix selects the server (yp / dexonline / switchboard) and
    therefore the parser to use.  Unrecognized urls and failed fetches both
    yield (RETRIEVE_FAILED, None); an unrecognized page format is logged.
    """
    res = RETRIEVE_FAILED
    data = None
    # which server? ('serverType' instead of 'type' - don't shadow the builtin)
    serverType = "?"
    url = ""
    if urlIn.startswith("yplist.php"):
        url = wy_com_address+"/%s" % urlIn
        serverType = "yp"
    elif urlIn.startswith("/servlet"):
        url = "http://www.dexonline.com%s" % urlIn
        serverType = "dex"
    elif urlIn.startswith("http://www.switchboard.com"):
        url = urlIn
        serverType = "switch"
    # retrieve - the original branched on server type here, but every branch
    # issued the identical getHttp(url, retryCount=3) call, so collapse them
    htmlText = None
    if serverType != "?":
        htmlText = getHttp(url, retryCount=3)
    # no?
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    # parse with the server-specific parser
    if serverType == "yp":
        res, data = m411.businessSearch(htmlText)
    elif serverType == "dex":
        res, data = m411_by411.businessSearchDex(htmlText)
    elif serverType == "switch":
        res, data = m411_by411.parseSwitchboardBusiness(htmlText)
    # ending
    if res == UNKNOWN_FORMAT:
        logParsingFailure("411-Business-Search-By-Url", urlIn , htmlText, url)
    return res, data
def retrieveSwitchboardBusiness(name, cityOrZip, state, surrounding, categoryOrName):
    """Search switchboard.com for a business.

    name           -- business name (or category text, per categoryOrName)
    cityOrZip      -- city name or a 5-digit ZIP code
    state          -- state id
    surrounding    -- "Yes"/"No"; only used here when logging a parse failure
    categoryOrName -- "Name" or "Category"; selects the server url template

    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    NOTE(review): an earlier definition of retrieveSwitchboardBusiness also
    exists in this file; this later one is the effective definition.
    """
    url = ""
    # renamed from 'zip' so we don't shadow the builtin
    isZip = cityOrZip.isdigit() and len(cityOrZip) == 5
    if categoryOrName == "Name":
        if isZip:
            url = switchboardServerUrlBusinessSearchZip % (
                urllib.quote(name),
                urllib.quote(cityOrZip),
                urllib.quote(state),
            )
        else:
            url = switchboardServerUrlBusinessSearch % (
                urllib.quote(name),
                urllib.quote(cityOrZip),
                urllib.quote(state),
            )
    if categoryOrName == "Category":
        if isZip:
            url = switchboardServerUrlBusinessSearchCategoryZip % (
                urllib.quote(name),
                urllib.quote(cityOrZip),
                urllib.quote(state),
            )
        else:
            url = switchboardServerUrlBusinessSearchCategory % (
                urllib.quote(name),
                urllib.quote(cityOrZip),
                urllib.quote(state),
            )
    # using cached for testing
    # htmlText = getHttpCached(url)
    htmlText = getHttp(url)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = parseSwitchboardBusiness(htmlText)
    if res == UNKNOWN_FORMAT:
        logParsingFailure(
            "411-Business-Search",
            name + "," + cityOrZip + "," + state + "," + surrounding + "," + categoryOrName,
            htmlText,
            url,
        )
    return res, data
def _retrieve_yp_business(name,cityOrZip,state,surrounding,categoryOrName):
    """Business search on the yp server.

    categoryOrName -- "Name" or "Category"; surrounding -- "Yes" to include
    surrounding areas (the YPsa url templates).
    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    # spaces in the name are sent as '+'
    name = name.replace(" ","+")
    # NOTE(review): urllib.quote() escapes the '+' just inserted as %2B -
    # presumably the server accepts that; confirm before changing.
    if surrounding == "Yes":
        byKind = {"Name": ypServerUrlBusinessSearchYPsa,
                  "Category": ypServerUrlBusinessSearchCategoryYPsa}
    else:
        byKind = {"Name": ypServerUrlBusinessSearch,
                  "Category": ypServerUrlBusinessSearchCategory}
    url = ""
    template = byKind.get(categoryOrName)
    if template is not None:
        url = template % (urllib.quote(name), urllib.quote(cityOrZip), urllib.quote(state))
    page = getHttp(url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411.businessSearch(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Business-Search", name+","+cityOrZip+","+state+","+surrounding+","+categoryOrName, page, url)
    return res, data
def _retrieve_dex_business(name,cityOrZip,state,surrounding,categoryOrName):
    """Business search on www.dexonline.com.

    The dex server does not accept ZIP codes: a 5-digit cityOrZip is logged
    and rejected with (RETRIEVE_FAILED, None).
    Returns (status, data); (RETRIEVE_FAILED, None) when the fetch fails.
    """
    if cityOrZip.isdigit() and len(cityOrZip)==5:
        log(SEV_EXC, "_retrieve_dex_business doesn't support cityOrZip='%s'" % cityOrZip)
        return RETRIEVE_FAILED, None
    # the server wants "true"/"false" for the surrounding-area flag
    sur = "true" if surrounding == "Yes" else "false"
    url = ""
    if categoryOrName == "Name":
        url = dexServerUrlBusinessSearch % (urllib.quote(cityOrZip),urllib.quote(state), sur, urllib.quote(name))
    elif categoryOrName == "Category":
        url = dexServerUrlBusinessSearchCategory % (sur, urllib.quote(name), urllib.quote(cityOrZip),urllib.quote(state))
    page = getHttp(url)
    if page is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.businessSearchDex(page)
    if UNKNOWN_FORMAT == res:
        logParsingFailure("411-Business-Search", name+","+cityOrZip+","+state+","+surrounding+","+categoryOrName, page, url)
    return res, data
def _retrieve_moneyextra():
    """Fetch the moneyextra currency page and hand it to the currency parser.

    Returns (RETRIEVE_FAILED, None) when the page cannot be fetched,
    otherwise whatever _parse_currency() returns.
    """
    global _g_moneyextra_url
    page = getHttp(_g_moneyextra_url, retryCount=3)
    if page is None:
        return (RETRIEVE_FAILED, None)
    return _parse_currency(page, currency_moneyextra.parseCurrencyData, _g_moneyextra_url)
def _spider_letter_page(self, letter, page, index):
    """Fetch one paginated titles page for 'letter' and parse it.

    The url template _g_manybooks_titles_url takes (letter, page); the url
    is encoded as iso-8859-1 before fetching.  Delegates the html to
    self._parse_letter_page().
    """
    pageUrl = _g_manybooks_titles_url % (letter, page)
    pageHtml = getHttp(pageUrl.encode("iso-8859-1"), handleException = False)
    return self._parse_letter_page(letter, pageHtml, index)