def loadPickledFiles():
    global g_wnWords, g_wnWordIndex, g_wnDictPath, g_wnIndexPath, g_wnWordsPath
    print "loading wn dictionary data files"
    if not arsutils.fFileExists(g_wnDictPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnDictPath
        return False
    if not arsutils.fFileExists(g_wnIndexPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnIndexPath
        return False
    if not arsutils.fFileExists(g_wnWordsPath):
        print "WordNet dictionary file '%s' doesn't exist" % g_wnWordsPath
        return False
    try:
        fo = open(g_wnIndexPath, "rb")
        g_wnWordIndex = cPickle.load(fo)
        fo.close()
        fo = open(g_wnWordsPath, "rb")
        g_wnWords = cPickle.load(fo)
        fo.close()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return False
def _merge_temps(self):
    if not os.path.exists(ebooks.g_storage):
        return
    temps = [os.path.join(ebooks.g_storage, name)
             for name in os.listdir(ebooks.g_storage)
             if self.temp_file_pattern.match(name)]
    if 0 == len(temps):
        return
    print "Merging temporary segments."
    temps.sort()
    try:
        for temp in temps:
            f = file(temp, "rb")
            try:
                data = cPickle.load(f)
            finally:
                f.close()
            os.remove(temp)
            for letter, index, book in data:
                letter_data = self._data.get(letter, None)
                if letter_data is None:
                    letter_data = [0, []]
                    self._data[letter] = letter_data
                assert index <= letter_data[0]
                letter_data[1].insert(index, book)
                letter_data[0] += 1
    except Exception, ex:
        print exceptionAsStr(ex)
def run(self):
    try:
        self.count = self._spider._spider_letter_range(self._letters)
    except _StopSpidering:
        return
    except Exception, ex:
        print exceptionAsStr(ex)
    self._spider._finish.set()
def retrieveRequests():
    global g_lastRequestLogId, g_dailyStats, g_modifiedDays
    cursor = None
    conn = getConnection()
    try:
        if None == g_lastRequestLogId:
            sql = "SELECT request_id, user_id, DATE_FORMAT(log_date, '%Y-%m-%d'), free_p, request, result, error FROM request_log ORDER BY request_id;"
        else:
            sql = "SELECT request_id, user_id, DATE_FORMAT(log_date, '%Y-%m-%d'), free_p, request, result, error FROM request_log WHERE request_id > " + str(g_lastRequestLogId) + " ORDER BY request_id;"
        cursor = conn.cursor()
        cursor.execute(sql)
        processed = 0
        prev_id = -1
        while True:
            row = cursor.fetchone()
            if None == row:
                break
            reqData = RequestData()
            reqData.request_id = row[0]
            reqData.user_id = row[1]
            reqData.log_date = row[2]
            reqData.free_p = row[3]
            reqData.request = row[4]
            reqData.result = row[5]
            reqData.error = row[6]  # either a number or None if there was no error
            assert reqData.request_id > prev_id
            prev_id = reqData.request_id
            logDate = reqData.log_date
            if g_dailyStats.has_key(logDate):
                g_dailyStats[logDate].append(reqData)
            else:
                g_dailyStats[logDate] = [reqData]
            if not g_modifiedDays.has_key(logDate):
                g_modifiedDays[logDate] = 1
            if reqData.request_id > g_lastRequestLogId:
                g_lastRequestLogId = reqData.request_id
            processed += 1
        cursor.close()
        # print "processed %d requests" % processed
    except _mysql_exceptions.Error, ex:
        if cursor:
            cursor.close()
        # log(SEV_HI, arsutils.exceptionAsStr(ex))
        print "exception in retrieveRequests()"
        print arsutils.exceptionAsStr(ex)
def retrieveUsers():
    global g_lastUserId, g_userStats
    cursor = None
    conn = getConnection()
    try:
        sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users ORDER BY user_id;"
        #sql = "SELECT user_id, device_info, DATE_FORMAT(cookie_issue_date, '%Y-%m-%d'), reg_code, DATE_FORMAT(registration_date, '%Y-%m-%d'), disabled_p FROM users WHERE user_id > " + str(g_lastUserId) + " ORDER BY user_id;"
        cursor = conn.cursor()
        cursor.execute(sql)
        processed = 0
        prev_id = -1
        while True:
            row = cursor.fetchone()
            if None == row:
                break
            userData = UserData()
            userData.user_id = row[0]
            userData.device_info = row[1]
            userData.cookie_issue_date = row[2]
            userData.reg_code = row[3]
            userData.registration_date = row[4]
            userData.disabled_p = row[5]
            if None == userData.registration_date:
                userData.fRegistered = False
            else:
                userData.fRegistered = True
            if userData.user_id > g_lastUserId:
                userData.new_user_p = True
            else:
                userData.new_user_p = False
            assert userData.user_id > prev_id
            prev_id = userData.user_id
            g_userStats.append(userData)
            if userData.user_id > g_lastUserId:
                g_lastUserId = userData.user_id
            processed += 1
        cursor.close()
        # print "processed %d users" % processed
    except _mysql_exceptions.Error, ex:
        if cursor:
            cursor.close()
        # log(SEV_HI, arsutils.exceptionAsStr(ex))
        print "exception in retrieveUsers()"
        print arsutils.exceptionAsStr(ex)
def _retrieve_xe():
    global _g_xe_url
    formData = {
        "basecur": "USD",
        "historical": "false",
        "month": "1",
        "day": "1",
        "year": "2004",
        "sort_by": "code",
        "template": "ict-en"
    }
    encFormData = urllib.urlencode(formData)
    headers = {
        #"Host": getHostFromUrl(_g_xe_url),
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
        "Referer": _g_xe_url
    }
    request = urllib2.Request(_g_xe_url, encFormData, headers)
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
    htmlText = None
    result = None
    try:
        result = opener.open(request)
        htmlText = result.read()
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (_g_xe_url, txt))
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException=False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # assumed handler: the snippet's outer try block was truncated and
        # needs an except clause to be valid Python
        log(SEV_EXC, exceptionAsStr(ex))
        return None
def _update_cache():
    global _g_currency_cache, _g_currency_cache_lock, _g_retrieve_functions, _g_tracked_currencies, _g_cache_update_interval
    t = threading.Timer(_g_cache_update_interval, _update_cache)
    t.start()
    out = {}
    tracked = {}
    tracked.update(_g_tracked_currencies)
    for func in _g_retrieve_functions:
        if 0 == len(tracked):
            break
        try:
            res, data = func()
            if RESULTS_DATA != res:
                log(SEV_MED, "currency parser: %s returned result: %d" % (str(func), res))
                continue
            for item in data.iteritems():
                key, value = item
                #if key in ["KRW"]:
                #    print "Func: %s" % (str(func))
                #    print "Key: %s Value: %s " % (key, str(value))
                if tracked.has_key(key) and 0 != value:
                    out[key] = value
                    del tracked[key]
        except Exception, ex:
            log(SEV_EXC, exceptionAsStr(ex))
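# _update_cache above re-arms a threading.Timer on itself before doing any
# work, which gives a simple periodic refresh that keeps running even if the
# body throws. A minimal self-contained sketch of the same pattern (the names
# here are illustrative, not part of the real module):
import threading

_REFRESH_INTERVAL = 60.0  # seconds; placeholder value

def _periodic_refresh():
    # schedule the next run first, so a failure below cannot stop the cycle
    threading.Timer(_REFRESH_INTERVAL, _periodic_refresh).start()
    # ... actual refresh work would go here ...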
def retrieveHttpResponseWithRedirectHandleExceptionRetry(url, retryCount=3):
    while True:
        try:
            #log(SEV_LOW, "retrieveHttpResponseWithRedirectHandleExceptionRetry: %s\n" % url)
            status, reason, responseText = retrieveHttpResponseWithRedirection(url)
        except socket.error, (err, txt):
            retryCount -= 1
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            if retryCount < 0:
                log(SEV_EXC, "failed to retrieve data for '%s'\ntoo many socket errors\n" % url)
                return None
            continue
        # TODO: add handling of urllib2.URLError?
        #   File "C:\Python22\lib\urllib2.py", line 809, in do_open
        #     raise URLError(err)
        #   URLError: <urlopen error (10060, 'Operation timed out')>
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
            return None
        # assumed success-path return (the snippet breaks off after the except
        # clauses; without a return the loop would never terminate)
        return status, reason, responseText
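# The function above retries only on socket.error and gives up immediately on
# anything else. A self-contained sketch of that retry shape using just the
# standard library (fetchWithRetry is an illustrative name, not part of this
# module):
import socket
import urllib2

def fetchWithRetry(url, retryCount=3):
    while True:
        try:
            return urllib2.urlopen(url).read()
        except socket.error:
            retryCount -= 1
            if retryCount < 0:
                return None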
def convertArticle(term, text):
    try:
        text = text.replace('__NOTOC__', '')
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide better support for categories,
        # i.e. remember categories on the server so the client can display
        # all articles in a given category
        # text = replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates;
        # in wikipedia, template text is replaced by a page from the Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, '')
        text = text.replace('\r', '')
        # this should be safe, as it's illegal in html to nest comments
        text = replaceRegExp(text, commentRe, '')
        text = stripTagBlocks(text, 'div')
        text = stripTagBlocks(text, 'table')
        text = stripBlocks(text, r'\{\|', r'\|\}')
        text = replaceRegExp(text, scriptRe, '')
        text = replaceTagList(text, ['b', 'strong'], "'''")
        text = replaceTagList(text, ['em', 'i', 'cite'], "''")
        text = replaceTagList(text, ['hr'], '----')
        text = replaceTagList(text, ['p'], '<br>')
        text = replaceTagList(text, [
            'dfn', 'code', 'samp', 'kbd', 'var', 'abbr', 'acronym',
            'blockquote', 'q', 'pre', 'ins', 'del', 'dir', 'menu', 'img',
            'object', 'big', 'span', 'applet', 'font', 'basefont', 'tr',
            'td', 'table', 'center', 'div'
        ], '')
        text = replaceRegExp(text, badLinkRe, '', supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        text = text.strip()
        text += '\n'
        return text
    except Exception, ex:
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ''
def run(self):
    global g_lupyIndex
    print "Thread start (dict lupy index)"
    try:
        g_lupyIndex.initialize()
    except Exception, ex:
        txt = arsutils.exceptionAsStr(ex)
        log(SEV_EXC, "exception in lupy index dictionary\n%s\n" % txt)
def retrieveHttpResponseHandleException(url):
    #log(SEV_LOW, "retrieveHttpResponseHandleException: %s\n" % url)
    try:
        status, reason, responseText = retrieveHttpResponse(url)
    except Exception, ex:
        txt = exceptionAsStr(ex)
        log(SEV_EXC, "failed to retrieve data for '%s'\nreason:%s\n" % (url, txt))
        return None
    # assumed success-path return; the snippet was truncated here
    return status, reason, responseText
def runTestCase(self, prepAndTest):
    testName, prepFun, testFun = prepAndTest
    try:
        request = prepFun()
        response = retrieveResponse(self.address, request)
        self.extractFields(response)
        try:
            testFun()
            stdout.write('.')
        except TestAssertionFailed, ex:
            print "\n--------------------------------------------------------------------------------"
            print "Test case %s FAILED: %s" % (testName, ex.cause)
            print "--------------------------------------------------------------------------------"
    except Exception, ex:
        print "\n--------------------------------------------------------------------------------"
        print "Test case %s caused Exception.\n" % (testName)
        print exceptionAsStr(ex)
        print "--------------------------------------------------------------------------------"
def _getHttpHandleExceptionRetry(url, postData, handleRedirect, dbgLevel, referer, retryCount, cookieJar):
    assert retryCount > 0
    while retryCount > 0:
        try:
            htmlTxt = _getHttpHelper(url, postData, handleRedirect, dbgLevel, referer, cookieJar)
            return htmlTxt
        except socket.error, (err, txt):
            log(SEV_EXC, "failed to retrieve data for '%s'\nsocket error:%d, %s\n" % (url, err, txt))
            retryCount -= 1
def retrieveInternational(code):
    global _g_retrieve_international
    for func in _g_retrieve_international:
        try:
            res, data = func(code)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % txt)
def retrieveAreaCodeByCity(city, state):
    global _g_retrieve_areaCodeByCity
    for func in _g_retrieve_areaCodeByCity:
        try:
            res, data = func(city, state)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % txt)
def retrievePerson(firstName, lastName, cityOrZip, state):
    global _g_retrieve_person
    for func in _g_retrieve_person:
        try:
            res, data = func(firstName, lastName, cityOrZip, state)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % txt)
def retrieveReversePhone(xxx, yyy, zzzz):
    global _g_retrieve_reversePhone
    for func in _g_retrieve_reversePhone:
        try:
            res, data = func(xxx, yyy, zzzz)
            if RETRIEVE_FAILED != res and UNKNOWN_FORMAT != res:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % txt)
def retrieveBusiness(name, cityOrZip, state, surrounding, categoryOrName):
    global _g_retrieve_business
    for func in _g_retrieve_business:
        try:
            res, data = func(name, cityOrZip, state, surrounding, categoryOrName)
            if res not in [RETRIEVE_FAILED, UNKNOWN_FORMAT]:
                return res, data
        except Exception, ex:
            txt = exceptionAsStr(ex)
            log(SEV_EXC, "failed to parse data\nreason:%s\n" % txt)
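# All of the retrieve* dispatchers above share one contract: each retriever in
# the _g_retrieve_* lists returns a (result, data) pair, and the dispatcher
# falls through to the next retriever on RETRIEVE_FAILED / UNKNOWN_FORMAT.
# A hypothetical retriever showing the expected shape; getHttp, _buildQueryUrl
# and _parseResults are illustrative stand-ins, and RESULTS_DATA is assumed to
# be the success code here, as in the currency module:
def _retrieve_business_example(name, cityOrZip, state, surrounding, categoryOrName):
    htmlText = getHttp(_buildQueryUrl(name, cityOrZip, state))
    if htmlText is None:
        # network failure: let the dispatcher try the next retriever
        return RETRIEVE_FAILED, None
    data = _parseResults(htmlText, surrounding, categoryOrName)
    if data is None:
        # page layout changed: also falls through to the next retriever
        return UNKNOWN_FORMAT, htmlText
    return RESULTS_DATA, data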
def find(self, book_id):
    self._lock.acquire()
    file_name = None
    try:
        try:
            file_name = self._cache.get(book_id, None)
            if file_name is None:
                return None
            # open the cached file while still holding the lock, so a
            # concurrent add() cannot pull it out from under us; the lock is
            # now released on every exit path, including exceptions
            return file(file_name, "rb")
        finally:
            self._lock.release()
    except Exception, ex:
        log(SEV_EXC, exceptionAsStr(ex))
        # the cached file is unreadable; drop it from disk
        if file_name is not None:
            try:
                os.remove(file_name)
            except Exception, ex1:
                log(SEV_EXC, exceptionAsStr(ex1))
def utf8ToLatin1(text):
    decoded = text
    try:
        decoded = text.decode("utf_8")
    except ValueError, ex:
        sys.stdout.write("exception while decoding utf-8\n")
        sys.stdout.write("%s\n" % arsutils.exceptionAsStr(ex))
        sys.stdout.write('\n')
        sys.stdout.write(text[:240])
        sys.stdout.write('\n')
        return text
def add(self, book_id, file_name):
    self._lock.acquire()
    try:
        cached_name = self._cache.get(book_id, None)
        if cached_name is not None:
            if cached_name == file_name:
                return
            else:
                self._cache[book_id] = file_name
                self._pickle_out()
                try:
                    os.remove(cached_name)
                except Exception, ex:
                    log(SEV_EXC, exceptionAsStr(ex))
        else:
            # assumed body of the truncated else branch: register the new
            # file and persist the cache index
            self._cache[book_id] = file_name
            self._pickle_out()
    finally:
        self._lock.release()
def initDictionary():
    global g_wnFo, g_wnDictPath, g_wnDefReadLock, g_random, g_fInitialized, g_fDisabled
    global g_thFo, g_thDictPath, g_thDefReadLock
    if g_fInitialized:
        return
    g_fInitialized = True
    g_fDisabled = True
    if not loadPickledFiles():
        return
    try:
        assert None == g_wnFo
        g_wnFo = open(g_wnDictPath, "rb")
        assert None == g_wnDefReadLock
        g_wnDefReadLock = Lock()
        assert None == g_thFo
        g_thFo = open(g_thDictPath, "rb")
        assert None == g_thDefReadLock
        g_thDefReadLock = Lock()
        assert None == g_random
        g_random = random.Random()
        g_random.seed()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return
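# initDictionary above follows a guard-flag pattern: a module-level
# g_fInitialized makes the call idempotent, and g_fDisabled stays True unless
# setup fully succeeds. A minimal sketch of that shape (illustrative names;
# the real function opens dictionary files and creates locks):
g_fExampleInitialized = False
g_fExampleDisabled = True

def initExampleOnce():
    global g_fExampleInitialized, g_fExampleDisabled
    if g_fExampleInitialized:
        return
    g_fExampleInitialized = True
    try:
        # ... open resources here ...
        g_fExampleDisabled = False  # assumed: enable only after setup succeeds
    except Exception:
        pass  # leave g_fExampleDisabled set; the feature stays off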
    # fragment: the thesaurus half of loadPickledFiles()
        return False
    if not arsutils.fFileExists(g_thWordsPath):
        print "Thesaurus dictionary file '%s' doesn't exist" % g_thWordsPath
        return False
    try:
        fo = open(g_thIndexPath, "rb")
        g_thWordIndex = cPickle.load(fo)
        fo.close()
        fo = open(g_thWordsPath, "rb")
        g_thWords = cPickle.load(fo)
        fo.close()
    except Exception, ex:
        print arsutils.exceptionAsStr(ex)
        return False
    print "Finished loading Thesaurus files"
    return True
def _retrieveHours(self, hours, fast):
    cursor = self._conn.cursor()
    try:
        start = time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, 0, 0, 0, -1, -1, -1))
        end = start + 3600 * 24
        cursor.execute("SELECT date FROM zap2it_cached_data WHERE provider_id = %d AND date >= '%s' AND date < '%s'" % (self._provider, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start)), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end))))
        for row in cursor:
            h = row[0].timetuple()[3]
            hours.remove(h)
    finally:
        cursor.close()
    # if all data is cached, return None and use _retrieveFromDatabase() instead
    if 0 == len(hours):
        return None
    opener = None
    if not self._has_grid:
        response, opener = _zap2it_retrieve_grid(self._jar, self._zipCode, self._provider)
        self._has_grid = True
        response.close()
    else:
        opener = _zap2it_opener(self._jar)
    if not fast:
        for h in hours:
            self._events[h] = threading.Event()
            date = time.localtime(time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, h, 0, 0, -1, -1, -1)))
            _g_zap2it_cache_manager._addActiveRetriever(_zap2it_retriever_key(self._provider, date), self)
    rows = 0
    duration = 1
    if fast:
        rows = 20
        duration = 3
        hours = [hours[0]]
    out = []
    formData = {
        "displayType": "Text",
        "duration": str(duration),
        "startDay": time.strftime("%m/%d/%Y", self._date),
        "category": "0",
        "station": "0",
        "rowdisplay": str(rows),
        "goButton": "GO"
    }
    stations = {}
    programs = {}
    for h in hours:
        formData["startTime"] = str(h)
        date = time.localtime(time.mktime((self._date.tm_year, self._date.tm_mon, self._date.tm_mday, h, 0, 0, -1, -1, -1)))
        encData = urllib.urlencode(formData)
        request = urllib2.Request(_g_zap2it_listings_url, encData)
        request.add_header("Referer", _g_zap2it_grid_url)
        response = opener.open(request)
        htmlText = None
        try:
            _zap2it_tracker(self._jar)
            contentLength = long(response.info()["Content-Length"])
            htmlText = response.read(contentLength)
            if fast:
                out.extend(_zap2it_parse_listings(None, htmlText, date, self._provider, stations, programs))
            else:
                _zap2it_parse_listings(self._conn, htmlText, date, self._provider, stations, programs)
        except Exception, ex:
            # todo: log exception
            print exceptionAsStr(ex)
            # dump the page for debugging, if we got as far as reading it
            if htmlText is not None:
                f = file(time.strftime('tvlistings-%Y%m%dT%H%M%S.html'), 'wb')
                f.write(htmlText)
                f.close()
        response.close()
        if not fast:
            self._events[h].set()
            _g_zap2it_cache_manager._removeActiveRetriever(_zap2it_retriever_key(self._provider, date))
def iterWikipediaArticles(sqlFileName, limit=None, fUseCache=False, fRecreateCache=False):
    #if limit:
    #    assert fUseCache == False
    print "fUseCache %d, fRecreateCache=%d" % (fUseCache, fRecreateCache)
    cacheWriter = None
    fReallyUseCache = False
    if fRecreateCache:
        cacheWriter = ArticleCacheWriter(sqlFileName)
        cacheWriter.open()
    else:
        if fUseCache and fCacheExists(sqlFileName):
            fReallyUseCache = True
        else:
            cacheWriter = ArticleCacheWriter(sqlFileName)
            cacheWriter.open()
    lang = os.path.basename(sqlFileName)[:2]
    print "database dump language: ", lang
    isUtf8 = False
    if lang in wikiToDbConvert.g_utf8Languages:
        isUtf8 = True
    if isUtf8:
        print "performing UTF-8 to Latin-1 conversion"
    if fReallyUseCache:
        fileName = getIdxFileName(sqlFileName)
        print "getting articles from cache %s" % fileName
        fo = open(fileName, "rb")
        count = 0
        while True:
            title = fo.readline()
            if len(title) == 0:
                break
            if fIsRedirectLine(title):
                redirect = fo.readline()
                title = title.strip()
                if title == REDIRECT_MARK:
                    # needed to remove a bogus redirect of 0xa0 => Space_(punctuation)
                    print "title after stripping is equal to '%s' (REDIRECT_MARK), so skipping" % REDIRECT_MARK
                    continue
                title = title[len(REDIRECT_MARK):]
                if len(title) == 0:
                    print "title after stripping is empty string, so skipping '%s'" % redirect
                    continue
                article = WikipediaArticleRedirect(title, redirect.strip())
            else:
                title = title.strip()
                line = fo.readline()
                if len(title) == 0:
                    print "title after stripping is empty string, so skipping '%s'" % line.strip()
                    continue
                lineParts = line.split(",")
                try:
                    ns = int(lineParts[0])
                    assert ns == NS_MAIN
                    txtOffset = int(lineParts[1])
                    txtLen = int(lineParts[2])
                    md5Hash = lineParts[3]
                    viewCount = int(lineParts[4])
                    article = WikipediaArticleFromCache(sqlFileName, title, ns, txtOffset, txtLen, md5Hash, viewCount)
                except ValueError, ex:
                    # the en 2004-09-17 db has an error in ns = int(lineParts[0]), so just ignore it
                    print "exception in iterWikipediaArticles"
                    print arsutils.exceptionAsStr(ex)
                    print "title:_%s_" % title
                    print "line:_%s_" % line
                    print "lineParts[0]=%s" % lineParts[0]
                    continue
            yield article
            count += 1
            if limit and count > limit:
                break
        fo.close()
        return
def spider_last_modified():
    return os.path.getmtime(_g_spider_data_path)

def reindex(index, data=None):
    if data is None:
        data = _load_spider_data()
    for letter, letter_data in data.iteritems():
        print "Indexing %d books for letter '%s'." % (letter_data[0], letter)
        for book in letter_data[1]:
            url, title, subtitle, author, book_id, code, formats = book
            title = _decode(title)
            subtitle = _decode(subtitle)
            author = _transform_author(_decode(author))
            book_id = _decode(book_id)
            code = _decode(code)
            formats.sort()
            index.index_ebook(title, subtitle, author, book_id, formats, PROVIDER_ID, code)

try:
    import psyco
    psyco.bind(_Spider._merge_temps)
    psyco.bind(_Spider._spider_letter_range)
except ImportError:
    # ImportError must be caught before the general handler, or it is unreachable
    print "psyco not available. You should consider using it (http://psyco.sourceforge.net/)"
except Exception, ex:
    print exceptionAsStr(ex)

if __name__ == "__main__":
    spider()