Example #1
def loadPickledFiles():
    global g_wnWords, g_wnWordIndex, g_wnDictPath, g_wnIndexPath, g_wnWordsPath
    print "loading wn dictionary data files"

    if not arsutils.fFileExists(g_wnDictPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnDictPath
        return False

    if not arsutils.fFileExists(g_wnIndexPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnIndexPath
        return False

    if not arsutils.fFileExists(g_wnWordsPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnWordsPath
        return False

    try:
        fo = open(g_wnIndexPath, "rb")
        g_wnWordIndex = cPickle.load(fo)
        fo.close()

        fo = open(g_wnWordsPath, "rb")
        g_wnWords = cPickle.load(fo)
        fo.close()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return False
Example #2
    def _merge_temps(self):
        if not os.path.exists(ebooks.g_storage):
            return
        temps = [os.path.join(ebooks.g_storage, name) for name in os.listdir(ebooks.g_storage) if self.temp_file_pattern.match(name)]
        if 0 == len(temps):
            return

        print "Merging temporary segments."
        temps.sort()
        try:
            for temp in temps:
                f = file(temp, "rb")
                try:
                    data = cPickle.load(f)
                finally:
                    f.close()
                    os.remove(temp)

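                # each pickled record is a (letter, index, book) tuple; rebuild the
                # per-letter [count, books] entries, preserving each segment's order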
                for letter, index, book in data:
                    letter_data = self._data.get(letter, None)
                    if letter_data is None:
                        letter_data = [0, []]
                        self._data[letter] = letter_data

                    assert index <= letter_data[0]
                    letter_data[1].insert(index, book)
                    letter_data[0] += 1
        except Exception, ex:
            print exceptionAsStr(ex)
Example #3
 def run(self):
     try:
         self.count = self._spider._spider_letter_range(self._letters)
     except _StopSpidering:
         return
     except Exception, ex:
         print exceptionAsStr(ex)
         self._spider._finish.set()
Example #4
def retrieveRequests():
    global g_lastRequestLogId, g_dailyStats, g_modifiedDays
    cursor = None
    conn = getConnection()
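    # incremental fetch: only rows newer than g_lastRequestLogId (all rows on the first run)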
    try:
        if None == g_lastRequestLogId:
            sql = "SELECT request_id, user_id, DATE_FORMAT(log_date, '%Y-%m-%d'), free_p, request, result, error FROM request_log ORDER BY request_id;"
        else:
            sql = "SELECT request_id, user_id, DATE_FORMAT(log_date, '%Y-%m-%d'), free_p, request, result, error FROM request_log WHERE request_id > " + str(g_lastRequestLogId) + " ORDER BY request_id;"

        cursor = conn.cursor()
        cursor.execute(sql)

        processed = 0
        prev_id = -1
        while True:
            row = cursor.fetchone()
            if None == row:
                break
            reqData = RequestData()
            reqData.request_id = row[0]
            reqData.user_id = row[1]
            reqData.log_date = row[2]
            reqData.free_p = row[3]
            reqData.request = row[4]
            reqData.result = row[5]
            reqData.error = row[6]  # it's either a number or None if there was no error

            assert reqData.request_id > prev_id
            prev_id = reqData.request_id

            logDate = reqData.log_date
            if g_dailyStats.has_key(logDate):
                g_dailyStats[logDate].append(reqData)
            else:
                g_dailyStats[logDate] = [reqData]

            if not g_modifiedDays.has_key(logDate):
                g_modifiedDays[logDate] = 1

            if reqData.request_id > g_lastRequestLogId:
                g_lastRequestLogId = reqData.request_id
            processed += 1
        cursor.close()

        # print "processed %d requests" % processed
    except _mysql_exceptions.Error, ex:
        if cursor:
            cursor.close()
        #log(SEV_HI, arsutils.exceptionAsStr(ex))
        print "exception in retrieveRequests()"
        print arsutils.exceptionAsStr(ex)
Example #5
def retrieveUsers():
    global g_lastUserId, g_userStats
    cursor = None
    conn = getConnection()
    try:
        sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users ORDER BY user_id;"
        #sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users WHERE user_id > " + str(g_lastUserId) + "ORDER BY user_id;";

        cursor = conn.cursor()
        cursor.execute(sql)

        processed = 0
        prev_id = -1
        while True:
            row = cursor.fetchone()
            if None == row:
                break
            userData = UserData()
            userData.user_id = row[0]
            userData.device_info = row[1]
            userData.cookie_issue_date = row[2]
            userData.reg_code = row[3]
            userData.registration_date = row[4]
            userData.disabled_p = row[5]
            if None == userData.registration_date:
                userData.fRegistered = False
            else:
                userData.fRegistered = True

            if userData.user_id > g_lastUserId:
                userData.new_user_p = True
            else:
                userData.new_user_p = False

            assert userData.user_id > prev_id
            prev_id = userData.user_id

            g_userStats.append(userData)
            if userData.user_id > g_lastUserId:
                g_lastUserId = userData.user_id
            processed += 1
        cursor.close()

        # print "processed %d requests" % processed
    except _mysql_exceptions.Error, ex:
        if cursor:
            cursor.close()
        #log(SEV_HI, arsutils.exceptionAsStr(ex))
        print "exception in retrieveUsers()"
        print arsutils.exceptionAsStr(ex)
Example #6
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None

        assert h1 is not None
        title = retrieveContents(h1).decode("iso-8859-1")

        subtitle = None
        author = None
        code = None

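        # the book page lists metadata as parallel "title-label" / "title-data" spans,
        # so a label's position gives the index of its value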
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass

        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass

        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")

        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")

        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue

                val = opt["value"]
                formats.append((format, val))

            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
Example #7
def _update_cache():
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval

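    # re-arm the timer first so the cache keeps refreshing itself even if a
    # retrieve function below raises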
    t = threading.Timer(_g_cache_update_interval, _update_cache)
    t.start()

    out = {}
    tracked = {}
    tracked.update(_g_tracked_currencies)
    for func in _g_retrieve_functions:
        if 0 == len(tracked):
            break
        try:
            res, data = func()
            if RESULTS_DATA != res:
                log(SEV_MED, "currency parser: %s returned result: %d" % (str(func), res))
                continue
            for item in data.iteritems():
                key, value = item

                #if key in ["KRW"]:
                #    print "Func: %s" % (str(func))
                #    print "Key: %s   Value: %s " % (key, str(value))
                if tracked.has_key(key) and 0 != value:
                    out[key] = value
                    del tracked[key]
        except Exception, ex:
            log(SEV_EXC, exceptionAsStr(ex))
Example #8
def _retrieve_xe():
    global _g_xe_url
    formData = {
        "basecur": "USD",
        "historical": "false",
        "month": "1",
        "day": "1",
        "year": "2004",
        "sort_by": "code",
        "template": "ict-en"
    }
    encFormData = urllib.urlencode(formData)
    headers = {
        #"Host": getHostFromUrl(_g_xe_url),
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
        "Referer": _g_xe_url
    }
    request = urllib2.Request(_g_xe_url, encFormData, headers)
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
    htmlText = None
    result = None
    try:
        result = opener.open(request)
        htmlText = result.read()
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (_g_xe_url, txt))
Example #9
def retrieveUsers():
    global g_lastUserId, g_userStats
    cursor = None
    conn = getConnection()
    try:
        sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users ORDER BY user_id;";
        #sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users WHERE user_id > " + str(g_lastUserId) + "ORDER BY user_id;";

        cursor = conn.cursor()
        cursor.execute(sql)

        processed = 0
        prev_id = -1
        while True:
            row = cursor.fetchone()
            if None == row:
                break
            userData = UserData()
            userData.user_id = row[0]
            userData.device_info = row[1]
            userData.cookie_issue_date = row[2]
            userData.reg_code = row[3]
            userData.registration_date = row[4]
            userData.disabled_p = row[5]

            if userData.user_id > g_lastUserId:
                userData.new_user_p = True
            else:
                userData.new_user_p = False

            assert userData.user_id > prev_id 
            prev_id = userData.user_id

            g_userStats.append(userData)
            if userData.user_id > g_lastUserId:
                g_lastUserId = userData.user_id
            processed += 1
        cursor.close()

        # print "processed %d requests" % processed
    except _mysql_exceptions.Error, ex:
        if cursor:
            cursor.close()
        #log(SEV_HI, arsutils.exceptionAsStr(ex))
        print "exception in retrieveUsers()"
        print arsutils.exceptionAsStr(ex)
Example #10
def convertArticle(term, text):
    try:
        text = text.replace('__NOTOC__', '')
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        #        text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, '')
        text = text.replace('\r', '')
        text = replaceRegExp(text, commentRe, '')  # This should be safe, as it's illegal in html to nest comments

        text = stripTagBlocks(text, 'div')
        text = stripTagBlocks(text, 'table')
        text = stripBlocks(text, r'\{\|', r'\|\}')

        text = replaceRegExp(text, scriptRe, '')

        text = replaceTagList(text, ['b', 'strong'], "'''")
        text = replaceTagList(text, ['em', 'i', 'cite'], "''")
        text = replaceTagList(text, ['hr'], '----')
        text = replaceTagList(text, ['p'], '<br>')
        text = replaceTagList(text, [
            'dfn', 'code', 'samp', 'kbd', 'var', 'abbr', 'acronym',
            'blockquote', 'q', 'pre', 'ins', 'del', 'dir', 'menu', 'img',
            'object', 'big', 'span', 'applet', 'font', 'basefont', 'tr', 'td',
            'table', 'center', 'div'
        ], '')
        text = replaceRegExp(text, badLinkRe, '', supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        text = text.strip()
        text += '\n'
        return text
    except Exception, ex:
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ''
Example #11
 def run(self):
     global g_lupyIndex
     print "Thread start (dict lupy index)"
     try:
         g_lupyIndex.initialize()
     except Exception, ex:
         txt = arsutils.exceptionAsStr(ex)
         log(SEV_EXC, "exception in lupy index dictionary\n%s\n" % (txt))
Example #12
def retrieveHttpResponseHandleException(url):
    #log(SEV_LOW, "retrieveHttpResponseHandleException: %s\n" % url)
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
        return None
Example #13
 def runTestCase(self, prepAndTest):
     testName, prepFun, testFun = prepAndTest
     try:
         request = prepFun()
         response = retrieveResponse(self.address, request)
         self.extractFields(response)
         try:
             testFun()
             stdout.write('.')
         except TestAssertionFailed, ex:
             print "\n--------------------------------------------------------------------------------"
             print "Test case %s FAILED: %s" % (testName, ex.cause)
             print "--------------------------------------------------------------------------------"
     except Exception, ex:
         print "\n--------------------------------------------------------------------------------"
         print "Test case %s caused Exception.\n" % (testName)
         print exceptionAsStr(ex)
         print "--------------------------------------------------------------------------------"
Example #14
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel, referer, retryCount, cookieJar):
    assert retryCount > 0
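    # retry transient socket errors up to retryCount times; other exceptions propagate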
    while retryCount > 0:
        try:
            htmlTxt = _getHttpHelper(url, postData, handleRedirect, dbgLevel, referer, cookieJar)
            return htmlTxt
        except socket.error, (err, txt):
            # txt comes straight from the unpacked socket.error tuple
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            retryCount -= 1
Example #15
def retrieveInternational(code):
    global _g_retrieve_international
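    # try each registered parser in order, returning the first usable result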
    for func in _g_retrieve_international:
        try:
            res, data = func(code)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
Example #16
def retrieveAreaCodeByCity(city, state):
    global _g_retrieve_areaCodeByCity
    for func in _g_retrieve_areaCodeByCity:
        try:
            res, data = func(city, state)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
Example #17
def retrievePerson(firstName,lastName,cityOrZip,state):
    global _g_retrieve_person
    for func in _g_retrieve_person:
        try:
            res, data = func(firstName,lastName,cityOrZip,state)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
Example #18
def retrieveReversePhone(xxx,yyy,zzzz):
    global _g_retrieve_reversePhone
    for func in _g_retrieve_reversePhone:
        try:
            res, data = func(xxx,yyy,zzzz)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
Example #19
def retrieveBusiness(name,cityOrZip,state,surrounding,categoryOrName):
    global _g_retrieve_business
    for func in _g_retrieve_business:
        try:
            res, data = func(name,cityOrZip,state,surrounding,categoryOrName)
            if res not in [RETRIEVE_FAILED, UNKNOWN_FORMAT]:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % (txt))
Example #20
 def find(self, book_id):
     self._lock.acquire()
     file_name = None
     try:
         file_name = self._cache.get(book_id, None)
         if file_name is None:
             return None
         return file(file_name, "rb")
     except Exception, ex:
         log(SEV_EXC, exceptionAsStr(ex))
         # opening the cached file failed, so drop it from disk
         if file_name is not None:
             try:
                 os.remove(file_name)
             except Exception, ex1:
                 log(SEV_EXC, exceptionAsStr(ex1))
     finally:
         # always release the lock, even when an exception was raised
         self._lock.release()
Example #21
def utf8ToLatin1(text):
    decoded = text
    try:
        decoded = text.decode("utf_8")
    except ValueError, ex:
        sys.stdout.write("exception while decoding utf-8\n")
        sys.stdout.write("%s\n" % arsutils.exceptionAsStr(ex))
        sys.stdout.write('\n')
        sys.stdout.write(text[:240])
        sys.stdout.write('\n')
        return text
Example #22
 def add(self, book_id, file_name):
     self._lock.acquire()
     try:
         cached_name = self._cache.get(book_id, None)
         if cached_name is not None:
             if cached_name == file_name:
                 return
             else:
                 self._cache[book_id] = file_name
                 self._pickle_out()
                 try:
                     os.remove(cached_name)
                 except Exception, ex:
                     log(SEV_EXC, exceptionAsStr(ex))
         else:
Example #23
def initDictionary():
    global g_wnFo, g_wnDictPath, g_wnDefReadLock, g_random, g_fInitialized, g_fDisabled
    global g_thFo, g_thDictPath, g_thDefReadLock
    if g_fInitialized:
        return
    g_fInitialized = True
    g_fDisabled = True
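    # pessimistically mark the dictionary disabled before trying to open the data files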
    if not loadPickledFiles():
        return
    try:
        assert None == g_wnFo
        g_wnFo = open(g_wnDictPath, "rb")
        assert None == g_wnDefReadLock
        g_wnDefReadLock = Lock()
        assert None == g_thFo
        g_thFo = open(g_thDictPath, "rb")
        assert None == g_thDefReadLock
        g_thDefReadLock = Lock()
        assert None == g_random
        g_random = random.Random()
        g_random.seed()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return
Example #24
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url, retryCount=3):
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseWithRedirectHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponseWithRedirection(url)
        except socket.error, (err,txt):
            retryCount -= 1
            #txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            if retryCount < 0:
                log(SEV_EXC, "failed to retrieve data for '%s'\ntoo many socket errors\n" % (url))
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        # raise URLError(err)
        # URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
Example #25
        return False

    if not arsutils.fFileExists(g_thWordsPath):
        print "Thesaurus dictionary file '%s' doesn't exist" % g_thWordsPath
        return False

    try:
        fo = open(g_thIndexPath, "rb")
        g_thWordIndex = cPickle.load(fo)
        fo.close()

        fo = open(g_thWordsPath, "rb")
        g_thWords = cPickle.load(fo)
        fo.close()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return False
    print "Finished loading Thesaurus files"
    return True


def initDictionary():
    global g_wnFo, g_wnDictPath, g_wnDefReadLock, g_random, g_fInitialized, g_fDisabled
    global g_thFo, g_thDictPath, g_thDefReadLock
    if g_fInitialized:
        return
    g_fInitialized = True
    g_fDisabled = True
    if not loadPickledFiles():
        return
    try:
Example #26
    def _retrieveHours(self, hours, fast):
        cursor = self._conn.cursor()
        try:
            start = time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, 0 , 0, 0, -1, -1, -1))
            end = start + 3600 * 24
            cursor.execute("SELECT date FROM zap2it_cached_data WHERE provider_id = %d AND date >= '%s' AND date < '%s'" %
                           (self._provider, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start)), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end))))
            for row in cursor:
                h = row[0].timetuple()[3]
                hours.remove(h)
        finally:
            cursor.close()

        # if all data is cached, return None and use _retrieveFromDatabase() instead
        if 0 == len(hours):
            return None

        opener = None
        if not self._has_grid:
            response, opener = _zap2it_retrieve_grid(self._jar, self._zipCode, self._provider)
            self._has_grid = True
            response.close()
        else:
            opener = _zap2it_opener(self._jar)

        if not fast:
            for h in hours:
                self._events[h] = threading.Event()
                date = time.localtime(time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, h , 0, 0, -1, -1, -1)))
                _g_zap2it_cache_manager._addActiveRetriever(_zap2it_retriever_key(self._provider, date), self)

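        # "fast" mode fetches a single 3-hour, 20-row text listing instead of one request per hour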
        rows = 0
        duration = 1
        if fast:
            rows = 20
            duration = 3
            hours = [hours[0]]

        out = []

        formData = {
            "displayType": "Text",
            "duration": str(duration),
            "startDay": time.strftime("%m/%d/%Y", self._date),
            "category": "0",
            "station": "0",
            "rowdisplay": str(rows),
            "goButton": "GO"
        }

        stations = {}
        programs = {}
        for h in hours:
            formData["startTime"] = str(h)
            date = time.localtime(time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, h , 0, 0, -1, -1, -1)))
            encData = urllib.urlencode(formData)
            request = urllib2.Request(_g_zap2it_listings_url, encData)
            request.add_header("Referer", _g_zap2it_grid_url)
            response = opener.open(request)
            htmlText = None
            try:
                _zap2it_tracker(self._jar)
                contentLength = long(response.info()["Content-Length"])
                htmlText = response.read(contentLength)
                if fast:
                    out.extend(_zap2it_parse_listings(None, htmlText, date, self._provider, stations, programs))
                else:
                    _zap2it_parse_listings(self._conn,  htmlText, date, self._provider, stations, programs)
            except Exception, ex:
                # todo: log exception
                print exceptionAsStr(ex)
                f = file(time.strftime('tvlistings-%Y%m%dT%H%M%S.html'), 'wb')
                f.write(htmlText)
                f.close()
            response.close()
            if not fast:
                self._events[h].set()
                _g_zap2it_cache_manager._removeActiveRetriever(_zap2it_retriever_key(self._provider, date))
Example #27
def iterWikipediaArticles(sqlFileName, limit=None, fUseCache=False, fRecreateCache=False):
    # if limit:
    #    assert fUseCache==False
    print "fUseCache %d, fRecreateCache=%d" % (fUseCache, fRecreateCache)
    cacheWriter = None
    fReallyUseCache = False
    if fRecreateCache:
        cacheWriter = ArticleCacheWriter(sqlFileName)
        cacheWriter.open()
    else:
        if fUseCache and fCacheExists(sqlFileName):
            fReallyUseCache = True
        else:
            cacheWriter = ArticleCacheWriter(sqlFileName)
            cacheWriter.open()

    lang = os.path.basename(sqlFileName)[:2]
    print "database dump language: ", lang
    isUtf8 = False

    if lang in wikiToDbConvert.g_utf8Languages:
        isUtf8 = True

    if isUtf8:
        print "performing UTF-8 to Latin-1 conversion"

    if fReallyUseCache:
        fileName = getIdxFileName(sqlFileName)
        print "getting articles from cache %s" % fileName
        fo = open(fileName, "rb")
        count = 0
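        # each cache record is a title line followed by either a redirect target
        # line or a "ns,txtOffset,txtLen,md5Hash,viewCount" line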
        while True:
            title = fo.readline()
            if len(title) == 0:
                break
            if fIsRedirectLine(title):
                redirect = fo.readline()
                title = title.strip()
                if title == REDIRECT_MARK:
                    # need this to remove stupid redirect of 0xa0=>Space_(punctuation)
                    print "title after stripping is equal to '%s' (REDIRECT_MARK), so skipping" % REDIRECT_MARK
                    continue
                title = title[len(REDIRECT_MARK):]
                if len(title) == 0:
                    print "title after stripping is empty string, so skipping '%s'" % redirect
                    continue
                article = WikipediaArticleRedirect(title, redirect.strip())
            else:
                title = title.strip()
                line = fo.readline()
                if len(title) == 0:
                    print "title after stripping is empty string, so skipping '%s'" % line.strip()
                    continue
                lineParts = line.split(",")
                try:
                    ns = int(lineParts[0])
                    assert ns == NS_MAIN
                    txtOffset = int(lineParts[1])
                    txtLen = int(lineParts[2])
                    md5Hash = lineParts[3]
                    viewCount = int(lineParts[4])
                    article = WikipediaArticleFromCache(sqlFileName, title, ns, txtOffset, txtLen, md5Hash, viewCount)
                except ValueError, ex:
                    # in en 2004-09-17 db we have an error in ns = int(lineParts[0]), so just ignore it
                    print "exception in iterWikipediaArticles"
                    print arsutils.exceptionAsStr(ex)
                    print "title:_%s_" % title
                    print "line:_%s_" % line
                    print "lineParts[0]=%s" % lineParts[0]
                    continue

            yield article
            count += 1
            if limit and count > limit:
                break
        fo.close()
        return
Example #28
def spider_last_modified():
    return os.path.getmtime(_g_spider_data_path)

def reindex(index, data=None):
    data = _load_spider_data()

    for letter, letter_data in data.iteritems():
        print "Indexing %d books for letter '%s'." % (letter_data[0], letter)
        for book in letter_data[1]:
            url, title, subtitle, author, book_id, code, formats = book
            title = _decode(title)
            subtitle = _decode(subtitle)
            author = _transform_author(_decode(author))
            book_id = _decode(book_id)
            code = _decode(code)
            formats.sort()
            index.index_ebook(title, subtitle, author, book_id, formats, PROVIDER_ID, code)

try:
    import psyco
    psyco.bind(_Spider._merge_temps)
    psyco.bind(_Spider._spider_letter_range)
except ImportError:
    print "psyco not available. You should consider using it (http://psyco.sourceforge.net/)"
except Exception, ex:
    print exceptionAsStr(ex)

if __name__ == "__main__":
    spider()