def _update_cache():
    """Re-arm the refresh timer and gather rates for the tracked currencies.

    The next call of this function is scheduled on a ``threading.Timer``
    before any provider is queried.  Each function in
    ``_g_retrieve_functions`` is then asked for data in turn; for every
    currency that is still missing, the first non-zero value offered is
    kept.  A provider that raises, or that returns a status other than
    ``RESULTS_DATA``, is logged and skipped.

    NOTE(review): the gathered rates are not stored anywhere in the code
    visible here -- confirm that the rest of the function publishes them.
    """
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval

    threading.Timer(_g_cache_update_interval, _update_cache).start()

    collected = {}
    # Work on a copy so that ticking currencies off leaves the global intact.
    pending = dict(_g_tracked_currencies)
    for provider in _g_retrieve_functions:
        if not pending:
            # Every tracked currency already has a value.
            break
        try:
            status, rates = provider()
            if status != RESULTS_DATA:
                message = "currency parser: %s returned result: %d" % (str(provider), status)
                log(SEV_MED, message)
                continue
            for code, rate in rates.iteritems():
                if code in pending and rate != 0:
                    collected[code] = rate
                    del pending[code]
        except Exception as ex:
            log(SEV_EXC, exceptionAsStr(ex))
Beispiel #2
0
def _update_cache():
    """Schedule the next refresh, then poll the providers for currency rates.

    Providers from ``_g_retrieve_functions`` are tried in order until every
    key of ``_g_tracked_currencies`` has received a non-zero value or the
    providers run out.  Failures (an exception, or a status different from
    ``RESULTS_DATA``) are logged and the next provider is tried.

    NOTE(review): in the code visible here the resulting dictionary is built
    but never written to the cache -- confirm against the full source.
    """
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval

    # The timer is started before the (possibly slow) provider calls.
    next_run = threading.Timer(_g_cache_update_interval, _update_cache)
    next_run.start()

    found = {}
    wanted = dict(_g_tracked_currencies)
    for fetch in _g_retrieve_functions:
        if not wanted:
            break
        try:
            outcome, table = fetch()
            if outcome != RESULTS_DATA:
                log(SEV_MED, "currency parser: %s returned result: %d" % (str(fetch), outcome))
                continue
            for symbol, quote in table.iteritems():
                if symbol in wanted and quote != 0:
                    # First usable value wins; stop looking for this symbol.
                    found[symbol] = quote
                    del wanted[symbol]
        except Exception as ex:
            log(SEV_EXC, exceptionAsStr(ex))
Beispiel #3
0
def _retrieve_xe():
    """POST the fixed currency-table form to ``_g_xe_url`` and read the reply.

    The request carries a browser-like User-Agent and uses the target URL as
    its Referer.  Any exception raised while opening or reading the response
    is logged at SEV_EXC instead of being propagated.

    NOTE(review): the downloaded text is not used or returned in the code
    visible here -- confirm against the full source.
    """
    global _g_xe_url
    form_fields = {
        "basecur": "USD",
        "historical": "false",
        "month": "1",
        "day": "1",
        "year": "2004",
        "sort_by": "code",
        "template": "ict-en"
    }
    post_body = urllib.urlencode(form_fields)
    request_headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
        "Referer": _g_xe_url
    }
    request = urllib2.Request(_g_xe_url, post_body, request_headers)
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
    htmlText = None
    result = None
    try:
        result = opener.open(request)
        htmlText = result.read()
    except Exception as ex:
        reason = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (_g_xe_url, reason))
def _spider_book_info(url, letter):
    """Scrape one book page and return its metadata and download formats.

    The page at *url* is fetched with ``getHttp`` and parsed with
    BeautifulSoup.  Returns None when the page has no ``<h1>``; otherwise
    returns the tuple
    ``(url, title, subtitle, author, book_id, code, formats)`` where
    *formats* is a sorted list of ``(format, option_value)`` pairs.
    *subtitle*, *author* and *code* stay None when the page has no matching
    label.

    *letter* is not used in the code visible here.

    NOTE(review): the outer ``try`` has no handler in this excerpt -- the
    rest of the function is not visible, so its error behaviour is unknown.
    """
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None

        # Redundant after the check above.
        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")

        subtitle = None
        author = None
        code = None

        # Labels and values are fetched as two separate lists; a label's
        # position is used as the index into the list of values.
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            # list.index raises ValueError when the label is absent.
            pass

        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            # 19 == len("/language.php?code="): take the text between that
            # prefix and the next "&".
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass

        # The book id is read from the hidden "tid" form field.
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")

        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")

        # Each <option> of the "book" <select> describes one download format.
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                # The first word of the option text is taken as the format name.
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue

                val = opt["value"]
                formats.append((format, val))

            except Exception, ex:
                # A failing option is logged and skipped.
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
def _retrieve_xe():
    """Submit the currency-table form to ``_g_xe_url`` and read the response.

    Exceptions raised while opening or reading the response are caught and
    logged at SEV_EXC.

    NOTE(review): in the code visible here the response text is read but
    neither parsed nor returned -- confirm against the full source.
    """
    global _g_xe_url
    query = {
        "basecur": "USD",
        "historical": "false",
        "month": "1",
        "day": "1",
        "year": "2004",
        "sort_by": "code",
        "template": "ict-en"
    }
    payload = urllib.urlencode(query)
    user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)"
    headers = {"User-Agent": user_agent, "Referer": _g_xe_url}
    request = urllib2.Request(_g_xe_url, payload, headers)
    redirect_handler = urllib2.HTTPRedirectHandler()
    opener = urllib2.build_opener(redirect_handler)
    htmlText = None
    result = None
    try:
        result = opener.open(request)
        htmlText = result.read()
    except Exception as ex:
        txt = exceptionAsStr(ex)
        failure = "failed to retrieve data for '%s'\nreason:%s\n" % (_g_xe_url, txt)
        log(SEV_EXC, failure)
Beispiel #6
0
def retrieveHttpResponseHandleException(url):
    """Fetch *url* and return the response body, or None on any failure.

    Two situations count as failure and are logged at SEV_EXC:
    ``retrieveHttpResponse`` raising an exception, and a response whose
    status is not 200.

    This excerpt originally stopped after the exception handler, so a
    successful request returned None as well; the success path below follows
    the complete definition of the same function later in the file.
    """
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception as ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
        return None
    if 200 != status:
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
        return None
    return responseText
Beispiel #7
0
 def run(self):
     """Print a start-up notice and initialise the global lupy index.

     An exception raised by ``g_lupyIndex.initialize()`` is logged at
     SEV_EXC and not propagated.
     """
     global g_lupyIndex
     print("Thread start (dict lupy index)")
     try:
         g_lupyIndex.initialize()
     except Exception as failure:
         details = arsutils.exceptionAsStr(failure)
         log(SEV_EXC, "exception in lupy index dictionary\n%s\n" % details)
Beispiel #8
0
 def run(self):
     """Announce the start and build the shared lupy index.

     Failures inside ``g_lupyIndex.initialize()`` are caught and reported
     through ``log`` at SEV_EXC.
     """
     global g_lupyIndex
     print("Thread start (dict lupy index)")
     try:
         g_lupyIndex.initialize()
     except Exception as err:
         message = "exception in lupy index dictionary\n%s\n" % arsutils.exceptionAsStr(err)
         log(SEV_EXC, message)
Beispiel #9
0
def retrieveHttpResponseHandleException(url):
    """Return the HTTP response body for *url*, or None if retrieval failed.

    An exception from ``retrieveHttpResponse`` and any status other than 200
    are both logged at SEV_EXC and reported to the caller as None.

    The excerpt ended right after the exception handler, which made a
    successful request indistinguishable from a failed one; the status check
    and the return value follow the complete definition of this function
    found later in the file.
    """
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception as ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC,
            "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
        return None
    if 200 != status:
        log(SEV_EXC,
            "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
        return None
    return responseText
def retrieveAreaCodeByCity(city, state):
    """Query the registered area-code-by-city providers until one answers.

    Returns the ``(res, data)`` pair of the first provider whose result code
    is neither RETRIEVE_FAILED nor UNKNOWN_FORMAT.  A provider that raises is
    logged at SEV_EXC and skipped.  If no provider yields a usable result the
    function falls off the end and returns None.
    """
    global _g_retrieve_areaCodeByCity
    for provider in _g_retrieve_areaCodeByCity:
        try:
            res, data = provider(city, state)
            if res in (RETRIEVE_FAILED, UNKNOWN_FORMAT):
                continue
            return res, data
        except Exception as ex:
            reason = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % reason)
def retrieveBusiness(name,cityOrZip,state,surrounding,categoryOrName):
    """Run a business search through the registered providers in order.

    The first ``(res, data)`` pair whose result code is neither
    RETRIEVE_FAILED nor UNKNOWN_FORMAT is returned.  Providers that raise
    are logged at SEV_EXC and the next one is tried.  When none succeeds the
    function ends without an explicit return, i.e. it returns None.
    """
    global _g_retrieve_business
    for search in _g_retrieve_business:
        try:
            res, data = search(name, cityOrZip, state, surrounding, categoryOrName)
            if res != RETRIEVE_FAILED and res != UNKNOWN_FORMAT:
                return res, data
        except Exception as ex:
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % exceptionAsStr(ex))
def retrievePerson(firstName,lastName,cityOrZip,state):
    """Run a person search through the registered providers in order.

    Returns the first ``(res, data)`` pair whose result code is neither
    RETRIEVE_FAILED nor UNKNOWN_FORMAT.  An exception from a provider is
    logged at SEV_EXC and the loop moves on.  If every provider fails,
    nothing is returned explicitly (the result is None).
    """
    global _g_retrieve_person
    for lookup in _g_retrieve_person:
        try:
            res, data = lookup(firstName, lastName, cityOrZip, state)
            if res in (RETRIEVE_FAILED, UNKNOWN_FORMAT):
                continue
            return res, data
        except Exception as ex:
            details = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % details)
Beispiel #13
0
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel, referer, retryCount, cookieJar):
    """Fetch *url* through ``_getHttpHelper``, retrying after socket errors.

    All arguments except *retryCount* are forwarded unchanged to
    ``_getHttpHelper``.  *retryCount* is the maximum number of attempts and
    must be positive.

    Returns whatever ``_getHttpHelper`` returns on the first attempt that
    does not raise ``socket.error``, or None once every attempt has failed.
    Exceptions of other types are not caught here.
    """
    assert retryCount > 0
    while retryCount > 0:
        try:
            return _getHttpHelper(url, postData, handleRedirect, dbgLevel, referer, cookieJar)
        except socket.error as ex:
            # The handler used to unpack the exception as (err, txt) and then
            # read an undefined name `ex`.  Binding the exception itself fixes
            # the NameError and also copes with socket errors that carry only
            # a message (socket.timeout, for instance).
            err = ex.args[0] if len(ex.args) == 2 else None
            txt = exceptionAsStr(ex)
            # %s instead of %d: identical output for an integer errno, and no
            # TypeError when no errno is available.
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%s, %s\n" % (url, err, txt))
            retryCount -= 1
    return None
def retrieveInternational(code):
    """Look up an international code through the registered providers.

    The providers in ``_g_retrieve_international`` are tried in order and the
    first ``(res, data)`` pair whose result code is neither RETRIEVE_FAILED
    nor UNKNOWN_FORMAT is returned.  A provider that raises is logged at
    SEV_EXC and skipped.  With no usable result the function returns None
    implicitly.
    """
    global _g_retrieve_international
    for provider in _g_retrieve_international:
        try:
            res, data = provider(code)
            if res in (RETRIEVE_FAILED, UNKNOWN_FORMAT):
                continue
            return res, data
        except Exception as ex:
            why = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % why)
def retrieveReversePhone(xxx,yyy,zzzz):
    """Run a reverse phone lookup through the registered providers.

    *xxx*, *yyy* and *zzzz* are passed unchanged to each function in
    ``_g_retrieve_reversePhone``.  The first ``(res, data)`` pair whose
    result code is neither RETRIEVE_FAILED nor UNKNOWN_FORMAT is returned;
    providers that raise are logged at SEV_EXC.  If all of them fail the
    function returns None implicitly.
    """
    global _g_retrieve_reversePhone
    for lookup in _g_retrieve_reversePhone:
        try:
            res, data = lookup(xxx, yyy, zzzz)
            if res in (RETRIEVE_FAILED, UNKNOWN_FORMAT):
                continue
            return res, data
        except Exception as ex:
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % exceptionAsStr(ex))
Beispiel #16
0
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel,
                                 referer, retryCount, cookieJar):
    """Call ``_getHttpHelper`` for *url*, trying again after a socket error.

    *retryCount* (which must be positive) bounds the number of attempts; the
    remaining arguments are handed to ``_getHttpHelper`` as they are.

    Returns the helper's result from the first attempt that does not raise
    ``socket.error``; returns None when all attempts were used up.  Other
    exception types propagate to the caller.
    """
    assert retryCount > 0
    while retryCount > 0:
        try:
            htmlTxt = _getHttpHelper(url, postData, handleRedirect, dbgLevel,
                                     referer, cookieJar)
            return htmlTxt
        except socket.error as ex:
            # Previously the clause unpacked the exception into (err, txt)
            # and then used an undefined `ex`, so every socket error ended in
            # a NameError (or a ValueError for single-argument errors such as
            # socket.timeout) instead of a retry.
            if len(ex.args) == 2:
                err = ex.args[0]
            else:
                err = None
            txt = exceptionAsStr(ex)
            # %s renders an integer errno exactly as %d did and also accepts
            # the None placeholder.
            log(
                SEV_EXC,
                "failed to retrieve data for '%s'\nsocket error:%s, %s\n" %
                (url, err, txt))
            retryCount -= 1
    return None
Beispiel #17
0
 def add(self, book_id, file_name):
     """Record *file_name* as the cached file for *book_id*.

     If the same name is already cached nothing changes.  If a different
     name was cached, the entry is replaced, the cache is persisted with
     ``_pickle_out()`` and the previously cached file is deleted (a failure
     to delete is only logged).

     NOTE(review): the excerpt ends at the ``else:`` for a book that is not
     cached yet, and no release of ``self._lock`` is visible -- confirm
     both against the full source.
     """
     self._lock.acquire()
     try:
         cached_name = self._cache.get(book_id, None)
         if cached_name is not None:
             if cached_name == file_name:
                 # Already cached under this name.
                 return
             else:
                 self._cache[book_id] = file_name
                 self._pickle_out()
                 # The old file is no longer referenced by the cache.
                 try:
                     os.remove(cached_name)
                 except Exception, ex:
                     log(SEV_EXC, exceptionAsStr(ex))
         else:
Beispiel #18
0
 def find(self, book_id):
     """Return the cached file for *book_id* opened for binary reading.

     Returns None when *book_id* has no cache entry, or when the lookup or
     the open fails.  In the failure case the exception is logged at
     SEV_EXC and, if a file name was found, removal of that file is
     attempted (a failing removal is logged as well).

     ``self._lock`` is held for the duration of the call and is released on
     every path, including the error path.  The caller is responsible for
     closing the returned file object.
     """
     self._lock.acquire()
     file_name = None
     try:
         file_name = self._cache.get(book_id, None)
         if file_name is None:
             return None
         return open(file_name, "rb")
     except Exception as ex:
         log(SEV_EXC, exceptionAsStr(ex))
         if file_name is not None:
             # The cached file could not be opened; try to get rid of it.
             try:
                 os.remove(file_name)
             except Exception as ex1:
                 log(SEV_EXC, exceptionAsStr(ex1))
         return None
     finally:
         # Previously the lock was released only on the two success paths,
         # so a single failed open left it held.
         self._lock.release()
Beispiel #19
0
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url, retryCount=3):
    """Fetch *url* (following redirects), retrying after socket errors.

    A ``socket.error`` is logged and the request repeated; after more than
    *retryCount* such errors the function gives up and returns None.  Any
    other exception is logged and ends the call with None immediately.

    Returns the response body when the final status is 200, otherwise None.

    NOTE(review): the success path was not visible in the excerpt this was
    restored from (as written, a successful request was simply repeated
    forever).  The status check mirrors retrieveHttpResponseHandleException
    -- confirm against the full source.
    """
    while True:
        try:
            status, reason, responseText = retrieveHttpResponseWithRedirection(
                url)
        except socket.error as ex:
            retryCount -= 1
            # Not every socket error is an (errno, message) pair --
            # socket.timeout carries only a message -- so unpack defensively
            # instead of in the except clause.
            if len(ex.args) == 2:
                err, txt = ex.args
            else:
                err, txt = None, str(ex)
            # %s renders an integer errno exactly as %d did.
            log(
                SEV_EXC,
                "failed to retrieve data for '%s'\nsocket error:%s, %s\n" %
                (url, err, txt))
            if retryCount < 0:
                log(
                    SEV_EXC,
                    "failed to retrieve data for '%s'\ntoo many socket errors\n"
                    % (url))
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        # raise URLError(err)
        # URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception as ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC,
                "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
        if 200 != status:
            log(SEV_EXC,
                "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
            return None
        return responseText
def _retrieve_dex_business(name,cityOrZip,state,surrounding,categoryOrName):
    """Search www.dexonline.com for a business and parse the result page.

    *categoryOrName* selects the search URL: "Name" or "Category"; for any
    other value the URL stays empty.  *surrounding* equal to "Yes" turns on
    the "surrounding" flag in the URL.

    Returns ``(RETRIEVE_FAILED, None)`` when *cityOrZip* is a five-digit
    number (zip codes are not accepted by this source) or when ``getHttp``
    returns None; otherwise returns the ``(res, data)`` pair produced by
    ``m411_by411.businessSearchDex``.  An UNKNOWN_FORMAT result is reported
    through ``logParsingFailure`` before being returned.
    """
    # Zip codes are not accepted by this source.
    if cityOrZip.isdigit() and len(cityOrZip) == 5:
        log(SEV_EXC, "_retrieve_dex_business doesn't support cityOrZip='%s'" % cityOrZip)
        return RETRIEVE_FAILED, None

    sur = "true" if surrounding == "Yes" else "false"

    url = ""
    if categoryOrName == "Name":
        by_name = (urllib.quote(cityOrZip), urllib.quote(state), sur, urllib.quote(name))
        url = dexServerUrlBusinessSearch % by_name
    elif categoryOrName == "Category":
        by_category = (sur, urllib.quote(name), urllib.quote(cityOrZip), urllib.quote(state))
        url = dexServerUrlBusinessSearchCategory % by_category

    htmlText = getHttp(url)
    if htmlText is None:
        return (RETRIEVE_FAILED, None)

    res, data = m411_by411.businessSearchDex(htmlText)
    if res == UNKNOWN_FORMAT:
        query = ",".join([name, cityOrZip, state, surrounding, categoryOrName])
        logParsingFailure("411-Business-Search", query, htmlText, url)
    return res, data
Beispiel #21
0
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url,retryCount=3):
    """Retrieve *url* with redirect handling, repeating after socket errors.

    Each ``socket.error`` is logged and the request is tried again; once
    *retryCount* retries are exhausted the function returns None.  Any other
    exception is logged and ends the call with None immediately.

    Returns the response body for a final status of 200, otherwise None.

    NOTE(review): the excerpt this was restored from showed no success path
    (a successful request was repeated endlessly).  The status check follows
    retrieveHttpResponseHandleException -- confirm against the full source.
    """
    while True:
        try:
            status, reason, responseText = retrieveHttpResponseWithRedirection(url)
        except socket.error as ex:
            retryCount -= 1
            # Unpacking (err, txt) in the except clause raised ValueError for
            # socket errors with a single argument (socket.timeout); fall
            # back to the plain message in that case.
            err_args = ex.args
            if len(err_args) == 2:
                err, txt = err_args
            else:
                err, txt = None, str(ex)
            # %s prints an integer errno exactly as %d did.
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%s, %s\n" % (url, err, txt))
            if retryCount < 0:
                log(SEV_EXC, "failed to retrieve data for '%s'\ntoo many socket errors\n" % (url))
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        # raise URLError(err)
        # URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception as ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
        if 200 != status:
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
            return None
        return responseText
Beispiel #22
0

def retrieveHttpResponseHandleException(url):
    """Fetch *url* and return the response body, or None on any failure.

    Both an exception from ``retrieveHttpResponse`` and a status other than
    200 are logged at SEV_EXC and reported to the caller as None.
    """
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception as ex:
        details = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, details))
        return None

    if status == 200:
        return responseText

    log(SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
    return None


# Get HTTP data from a given url. Return either the HTTP data or None if
# there was an error during processing.
# If there was a socket error (usually 'connection refused', 'connection
# reset by peer' or 'connection timed out'), retry the request retryCount
# times.
# NOTE(review): this excerpt stops inside the try block; the error handling
# and the return statements are not visible here.
def retrieveHttpResponseHandleExceptionRetry(url, retryCount=3):
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponse(url)
Beispiel #23
0
    finally:
        conn.close()
    return status, reason, responseText

def retrieveHttpResponseHandleException(url):
    """Return the HTTP response body for *url*, or None if anything failed.

    A raised exception and a non-200 status are each logged at SEV_EXC
    before None is returned.
    """
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception as ex:
        message = "failed to retrieve data for '%s'\nreason:%s\n" % (url, exceptionAsStr(ex))
        log(SEV_EXC, message)
        return None

    succeeded = (status == 200)
    if not succeeded:
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason: got %d\n" % (url, status))
        return None
    return responseText

# get HTTP data from a given url. Return either HTTP data or None if there
# was an error during processing
# If there was a socket error (usually 'connection refused' or 'connection reset by peer'
# or 'connection timedout'), retry the request retryCount times
def retrieveHttpResponseHandleExceptionRetry(url,retryCount=3):
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponse(url)
        except socket.error, (err,txt):
            retryCount -= 1