def _update_cache():
    # Periodic currency-cache refresher: re-arms itself via threading.Timer,
    # then polls each registered retrieve function until a non-zero rate has
    # been collected for every tracked currency.
    # NOTE(review): in this excerpt the collected `out` dict is never stored
    # into _g_currency_cache -- presumably the function continues past this
    # view (under _g_currency_cache_lock); confirm against the full file.
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval
    # schedule the next refresh before doing any work, so a parser failure
    # cannot stop future updates
    t = threading.Timer(_g_cache_update_interval, _update_cache)
    t.start()
    out = {}
    tracked = {}
    tracked.update(_g_tracked_currencies)
    for func in _g_retrieve_functions:
        if 0 == len(tracked):
            break  # every tracked currency already has a rate
        try:
            res, data = func()
            if RESULTS_DATA != res:
                log( SEV_MED, "currency parser: %s returned result: %d" % (str(func), res))
                continue
            for item in data.iteritems():
                key, value = item
                #if key in ["KRW"]:
                #    print "Func: %s" % (str(func))
                #    print "Key: %s Value: %s " % (key, str(value))
                # a 0 rate is treated as "no data" so a later source may fill it
                if tracked.has_key(key) and 0 != value:
                    out[key] = value
                    del tracked[key]
        except Exception, ex:
            log(SEV_EXC, exceptionAsStr(ex))
def _update_cache():
    # Self-rescheduling refresh of the currency rates: asks each retrieve
    # function in turn for rates of the currencies still missing.
    # NOTE(review): `out` is built but not written to _g_currency_cache in
    # this excerpt -- the store step is presumably below this view; confirm.
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval
    t = threading.Timer(_g_cache_update_interval, _update_cache)
    t.start()
    out = {}
    tracked = {}
    tracked.update(_g_tracked_currencies)
    for func in _g_retrieve_functions:
        if 0 == len(tracked):
            break  # nothing left to resolve
        try:
            res, data = func()
            if RESULTS_DATA != res:
                log(SEV_MED, "currency parser: %s returned result: %d" % (str(func), res))
                continue
            for item in data.iteritems():
                key, value = item
                #if key in ["KRW"]:
                #    print "Func: %s" % (str(func))
                #    print "Key: %s Value: %s " % (key, str(value))
                # skip zero rates -- leave them for the next source
                if tracked.has_key(key) and 0 != value:
                    out[key] = value
                    del tracked[key]
        except Exception, ex:
            log(SEV_EXC, exceptionAsStr(ex))
def _retrieve_xe():
    # Fetch USD-based exchange rates from xe.com by POSTing its "ict-en"
    # currency-table form.
    # NOTE(review): this excerpt ends right after the fetch; the parsing of
    # htmlText and the return value are not visible here -- confirm against
    # the full file.
    global _g_xe_url
    formData = {
        "basecur": "USD",
        "historical": "false",
        "month": "1",
        "day": "1",
        "year": "2004",
        "sort_by": "code",
        "template": "ict-en"
    }
    encFormData = urllib.urlencode(formData)
    headers = {
        #"Host": getHostFromUrl(_g_xe_url),
        # masquerade as MSIE so the site serves the regular page
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
        "Referer": _g_xe_url
    }
    request = urllib2.Request(_g_xe_url, encFormData, headers)
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
    htmlText = None
    result = None
    try:
        result = opener.open(request)
        htmlText = result.read()
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (_g_xe_url, txt))
def _spider_book_info(url, letter):
    # Scrape one book page: title, optional subtitle/author/language code,
    # the hidden "tid" (book id) and the list of downloadable formats.
    # Returns (url, title, subtitle, author, book_id, code, formats) or None
    # when the page lacks an <h1>. `letter` is unused in this excerpt --
    # presumably part of the spidering caller's contract; confirm.
    # NOTE(review): the outer `try:` has no matching except/finally in this
    # excerpt -- the definition appears truncated here.
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)  # old (pre-3.x) BeautifulSoup feed-style API
        h1 = soup.first("h1")
        if h1 is None:
            return None
        assert h1 is not None  # redundant after the guard above
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        # label spans and data spans form parallel lists; a ValueError from
        # list.index() just means that field is absent on this page
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            # skip the "/language.php?code=" prefix (19 chars) up to the next '&'
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                # first word of the option text is the format name
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
def retrieveHttpResponseHandleException(url): #log(SEV_LOW, "retrieveHttpResponseHandleException: %s\n" % url) try: status, reason, responseText = retrieveHttpResponse(url) except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt)) return None
def run(self): global g_lupyIndex print "Thread start (dict lupy index)" try: g_lupyIndex.initialize() except Exception, ex: txt = arsutils.exceptionAsStr(ex) log(SEV_EXC, "exception in lupy index dictionary\n%s\n" % (txt))
def retrieveAreaCodeByCity(city, state): global _g_retrieve_areaCodeByCity for func in _g_retrieve_areaCodeByCity: try: res, data = func(city, state) if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res: return res, data except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
def retrieveBusiness(name,cityOrZip,state,surrounding,categoryOrName): global _g_retrieve_business for func in _g_retrieve_business: try: res, data = func(name,cityOrZip,state,surrounding,categoryOrName) if res not in [RETRIEVE_FAILED, UNKNOWN_FORMAT]: return res, data except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
def retrievePerson(firstName,lastName,cityOrZip,state): global _g_retrieve_person for func in _g_retrieve_person: try: res, data = func(firstName,lastName,cityOrZip,state) if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res: return res, data except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel, referer, retryCount, cookieJar): assert retryCount > 0 while retryCount > 0: try: htmlTxt = _getHttpHelper(url, postData, handleRedirect, dbgLevel, referer, cookieJar) return htmlTxt except socket.error, (err,txt): txt = exceptionAsStr(ex) log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt)) retryCount -= 1
def retrieveInternational(code): global _g_retrieve_international for func in _g_retrieve_international: try: res, data = func(code) if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res: return res, data except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
def retrieveReversePhone(xxx,yyy,zzzz): global _g_retrieve_reversePhone for func in _g_retrieve_reversePhone: try: res, data = func(xxx,yyy,zzzz) if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res: return res, data except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel, referer, retryCount, cookieJar): assert retryCount > 0 while retryCount > 0: try: htmlTxt = _getHttpHelper(url, postData, handleRedirect, dbgLevel, referer, cookieJar) return htmlTxt except socket.error, (err, txt): txt = exceptionAsStr(ex) log( SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt)) retryCount -= 1
def add(self, book_id, file_name):
    # Record that book_id is now cached in file_name. If the id was already
    # mapped to a different file, persist the new mapping and delete the old
    # cache file (best effort).
    # NOTE(review): this excerpt is truncated -- the trailing `else:` branch
    # (first-time book_id) is cut off, and no self._lock.release() is visible;
    # confirm against the full file that the lock is released on every path,
    # including the early `return`.
    self._lock.acquire()
    try:
        cached_name = self._cache.get(book_id, None)
        if cached_name is not None:
            if cached_name == file_name:
                return  # mapping unchanged, nothing to do
            else:
                self._cache[book_id] = file_name
                self._pickle_out()  # persist the updated mapping
                try:
                    os.remove(cached_name)
                except Exception, ex:
                    log(SEV_EXC, exceptionAsStr(ex))
        else:
def find(self, book_id): self._lock.acquire() file_name = None try: file_name = self._cache.get(book_id, None) if file_name is None: self._lock.release() return None; f = file(file_name, "rb") self._lock.release() return f except Exception, ex: log(SEV_EXC, exceptionAsStr(ex)) if file_name is not None: try: os.remove(file_name) except Exception, ex1: log(SEV_EXC, exceptionAsStr(ex1)) pass
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url, retryCount=3):
    # Like retrieveHttpResponseWithRedirection() but retries up to retryCount
    # times on socket-level errors; any other exception aborts with None.
    # NOTE(review): the success path (status check / return of responseText)
    # is not visible in this excerpt -- as shown, a successful fetch would
    # just loop again; confirm against the full file.
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseWithRedirectHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponseWithRedirection(
                url)
        except socket.error, (err, txt):
            retryCount -= 1
            #txt = exceptionAsStr(ex)
            log(
                SEV_EXC,
                "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            if retryCount < 0:
                log(
                    SEV_EXC,
                    "failed to retrieve data for '%s'\ntoo many socket errors\n" % (url))
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        #     raise URLError(err)
        #   URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
def _retrieve_dex_business(name,cityOrZip,state,surrounding,categoryOrName):
    """Business search via www.dexonline.com.

    The site cannot search by ZIP code, so a 5-digit numeric cityOrZip is
    rejected up front. Returns (res, data) from the dex parser, logging the
    raw page when it comes back in an unknown format."""
    # dex has no ZIP search -- bail out early
    if cityOrZip.isdigit() and len(cityOrZip)==5:
        log(SEV_EXC, "_retrieve_dex_business doesn't support cityOrZip='%s'" % cityOrZip)
        return RETRIEVE_FAILED, None
    sur = {"Yes": "true"}.get(surrounding, "false")
    url = ""
    if "Name" == categoryOrName:
        url = dexServerUrlBusinessSearch % (urllib.quote(cityOrZip),urllib.quote(state), sur, urllib.quote(name))
    elif "Category" == categoryOrName:
        url = dexServerUrlBusinessSearchCategory % (sur, urllib.quote(name), urllib.quote(cityOrZip),urllib.quote(state))
    htmlText = getHttp(url)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)
    res, data = m411_by411.businessSearchDex(htmlText)
    if UNKNOWN_FORMAT == res:
        # keep the page around so the parser can be fixed later
        logParsingFailure("411-Business-Search", name+","+cityOrZip+","+state+","+surrounding+","+categoryOrName, htmlText, url)
    return res, data
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url,retryCount=3):
    # Redirect-following fetch with retry on socket errors (up to retryCount
    # attempts); non-socket exceptions return None immediately.
    # NOTE(review): truncated excerpt -- the success return after the
    # try/except is not visible here; confirm against the full file.
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseWithRedirectHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponseWithRedirection(url)
        except socket.error, (err,txt):
            retryCount -= 1
            #txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            if retryCount < 0:
                log(SEV_EXC, "failed to retrieve data for '%s'\ntoo many socket errors\n" % (url))
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        #     raise URLError(err)
        #   URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
# get HTTP data from a given url. Return either HTTP data or None if there # was an error during processing def retrieveHttpResponseHandleException(url): #log(SEV_LOW, "retrieveHttpResponseHandleException: %s\n" % url) try: status, reason, responseText = retrieveHttpResponse(url) except Exception, ex: txt = exceptionAsStr(ex) log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt)) return None if 200 != status: log( SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status)) return None return responseText # get HTTP data from a given url. Return either HTTP data or None if there # was an error during processing # If there was a socket error (usually 'connection refused' or 'connection reset by peer' # or 'connection timedout'), retry the request retryCount times def retrieveHttpResponseHandleExceptionRetry(url, retryCount=3): while True: try: #log(SEV_LOW, "retrieveHttpResponseHandleExceptionRetry: %s\n" % url) status, reason, responseText = retrieveHttpResponse(url)
    # NOTE(review): the two lines below are the tail of a function whose start
    # is not visible in this excerpt (it returns the parsed HTTP response,
    # closing the connection in all cases).
    finally:
        conn.close()
    return status, reason, responseText

# get HTTP data from a given url. Return either HTTP data or None if there
# was an error during processing
def retrieveHttpResponseHandleException(url):
    # Logged failure (exception or non-200 status) yields None; otherwise the
    # response body is returned.
    #log(SEV_LOW, "retrieveHttpResponseHandleException: %s\n" % url)
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
        return None
    if 200 != status:
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
        return None
    return responseText

# get HTTP data from a given url. Return either HTTP data or None if there
# was an error during processing
# If there was a socket error (usually 'connection refused' or 'connection reset by peer'
# or 'connection timedout'), retry the request retryCount times
def retrieveHttpResponseHandleExceptionRetry(url,retryCount=3):
    # NOTE(review): truncated in this excerpt right after the first retry
    # decrement; the rest of the retry loop is not visible here.
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponse(url)
        except socket.error, (err,txt):
            retryCount -= 1