import time
import socket
import urllib
import urllib2
from cookielib import LWPCookieJar

# BaseThread, LogAdapter, ContentEncodingProcessor, track_db, vavava.util and
# reg_helper are assumed to be provided elsewhere in this package.


class LoopTimer(BaseThread):
    """Call a function repeatedly, once every `interval` seconds:

        t = LoopTimer(30.0, function=f, args=[], kwargs={})
        t.start()
        t.cancel()  # stop the timer's action if it's still waiting
    """
    def __init__(self, interval=1.0, accuracy=0.01, function=None,
                 log=None, args=(), kwargs={}):
        BaseThread.__init__(self, log)
        self.log = LogAdapter(log)
        self._interval = interval
        self._accuracy = accuracy
        self._func = function
        self._args = args
        self._kwargs = kwargs

    def do(self):
        self.is_active.clear()
        start = 0.0
        tmp = 0.0
        while self.IsActive:
            now = time.time()  # wall-clock time
            interval = now - start
            assert interval > 0
            if start == 0.0 or interval > self._interval:
                if tmp < interval:
                    tmp = interval
                    self.log.debug("LoopTimer current interval = %f (%f)" % (interval, tmp))
                start = now
                if self._func:
                    self.log.debug("LoopTimer.func started")
                    self._func(*self._args, **self._kwargs)
                    self.log.debug("LoopTimer.func ended")
                else:
                    self.log.warn("LoopTimer has no target to run")
            else:
                self.is_active.wait(timeout=self._accuracy)
        self.is_active.set()
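
# Usage sketch (assumption, not part of the original module): this assumes BaseThread
# exposes threading.Thread-like start() plus the cancel() mentioned in the docstring,
# and that do() runs in the worker thread. The heartbeat callback is a placeholder.
def _example_loop_timer():
    def heartbeat():
        print("tick")
    t = LoopTimer(interval=5.0, accuracy=0.01, function=heartbeat)
    t.start()       # assumed to run do() in a background thread
    time.sleep(20)
    t.cancel()      # stop the timer's action if it's still waiting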
class FetcherBase:
    def __init__(self, item_id, track_type, log=None):
        self.log = LogAdapter(log)
        self.item_id = item_id
        self.track_type = track_type
        self.name = "FetcherBase"
        self.proxy_dict = None
        self.track_dict = {}
        self.http = HttpClient()
        self.http.req_timeout = 30
        self.conn = None
        self.db = None
        self.debug_level = 0
        self.initialised = False

    def SetProxy(self, proxy_dict):
        self.proxy_dict = proxy_dict
        if proxy_dict is not None and len(proxy_dict) > 0:
            self.http.AddProxy(self.proxy_dict)

    def SetDBConn(self, conn):
        self.conn = conn
        self.db = track_db.TrackDB(db_conn=self.conn, log=self.log)

    def Fetch(self):
        # subclasses are expected to implement _fetch()
        try:
            self._fetch()
        except Exception as e:
            self._error_handle(e)

    def _chk_new_items(self, item_list):
        try:
            counter = len(item_list)
            if counter == 0:
                self.log.debug("[%s] Item info check finished with 0 results.", self.item_id)
                return False
            # got item info
            if not self.initialised:
                self.track_dict = self.db.item_get_top_n(self.item_id, counter, self.track_type)
                self.initialised = True
            for item_info in item_list:
                if item_info not in self.track_dict:
                    self.track_dict[item_info] = item_info  # ???
                    result = self._new_data(item_info)
                    if result == -1:
                        self.log.warn("[%s] Item info already in db, track_time=%s",
                                      self.name, str(item_info.track_time))
                    elif result == 2:
                        self.log.info("[%s] Tracking ended for track_time=%s",
                                      self.name, str(item_info.track_time))
                        # break
            self.log.debug("[%s] Item info check finished.", self.item_id)
            return True
        except Exception as e:
            self._error_handle(e)
            return False

    def _new_data(self, item):
        """Return codes:
             1  new data stored
             2  tracking ended
            -1  already in db
        """
        self.log.info("[%s] New track info coming:[type=%s|delivered=%d] ITEM=%s,%s,%s,%s",
                      self.name, self.track_type, item.is_ended,
                      item.name, item.description, item.location, item.track_time)
        if self.db:
            return self.db.sp_insert_new_item(self.track_type, item.is_ended, item.name,
                                              item.track_time, item.description, item.location)
        else:
            raise Exception("db is not initialised")

    def _error_handle(self, err):
        if err:
            self.log.error("[%s] error happened: %s", self.item_id, err)
            if isinstance(err, BaseException):
                self.log.exception(err)
            if self.db:
                return self.db.sp_update_item_status(self.track_type, self.item_id)
            else:
                self.log.error("[%s] error happened: db access error in the meantime...", self.item_id)

    def _dump_error(self, item_id, fetch_url, text="", e=None):
        """Save the page that triggered an exception."""
        SAVEPATH = r'ex-pages'
        from time import localtime, time
        import codecs, os
        if text is None or text == "":
            return
        t = localtime(time())
        t_str = "%04d%02d%02d%02d%02d%02d" % (t.tm_year, t.tm_mon, t.tm_mday,
                                              t.tm_hour, t.tm_min, t.tm_sec)
        filename = r'./%s/%s_%s.html' % (SAVEPATH, item_id, t_str)
        vavava.util.assure_path("./%s" % SAVEPATH)
        f = codecs.open(filename, "w", 'utf-8')
        file_full_name = os.path.abspath(filename)
        if f:
            f.writelines('<!--' + fetch_url + '-->' + os.linesep)
            f.write(text)
            f.close()
        self.log.error(r"page saved at %s", file_full_name)
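
# Usage sketch (assumption, not part of the original module): a concrete fetcher
# overrides _fetch(), downloads its source and feeds parsed items into
# _chk_new_items(). The URL pattern and _parse() helper are hypothetical placeholders.
class ExampleFetcher(FetcherBase):
    def _fetch(self):
        html = self.http.Get("http://example.com/track/%s" % self.item_id)
        items = self._parse(html)        # hypothetical parser returning item objects
        self._chk_new_items(items)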
class HttpClient(object):
    """A simple HTTP client."""
    def __init__(self, log=None, debug_level=0, req_timeout=30):
        self.__log = LogAdapter(log)
        self.__content = None
        self.__cookie = None
        self.__cookie_str = ""
        self.__req_timeout = req_timeout
        self.__httpDebugLevel = debug_level
        self.__headers_dic = {'Referer': "http://www.google.com/"}
        self.__cookie_enabled = False
        self.__proxy_enable = False
        self.__proxy_dic = None
        self._opener = None
        self.__is_busy = False
        self.__buffer_size = 1024 * 100
        self.SetDebugLevel(debug_level)
        self.header_refer_ = "http://www.google.com/"
        self.header_user_agent_ = ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                                   'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')

    def Get(self, url, download_callback=None):
        if self._opener is None:
            self.__install_opener()
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, headers=self.__headers_dic)
        resp = self._opener.open(req, timeout=self.__req_timeout)
        if url != resp.url:
            self.__log.debug("%s redirected to: %s", url, resp.url)
        self.__content = resp.read()
        return self.__content

    def TryGet(self, url, download_callback=None, retry=3):
        while True:
            try:
                return self.Get(url, download_callback)
            except Exception:
                if retry > 0:
                    retry -= 1
                else:
                    raise

    def Post(self, url, post_dic):
        if self._opener is None:
            self.__install_opener()
        postdata = urllib.urlencode(post_dic).encode('gb2312')
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, data=postdata, headers=self.__headers_dic)
        resp = self._opener.open(req)
        self.__content = resp.read(self.__buffer_size)
        return self.__content

    def GetData(self, url, fp, duration=None, buffer_size=1024*1024):
        if duration:
            stop_time = time.time() + float(duration)
        if self._opener is None:
            self.__install_opener()
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, headers=self.__headers_dic)
        resp = self._opener.open(req, timeout=self.__req_timeout)
        if url != resp.url:
            self.__log.debug("%s redirected to: %s", url, resp.url)
        data = resp.read(buffer_size)
        while data:
            fp.write(data)
            if duration and stop_time < time.time():
                return
            data = resp.read(buffer_size)

    def EnableCookieSupport(self, enable=True):
        if enable and self.__cookie is None:
            self.__cookie = LWPCookieJar()
        elif not enable:
            self.__cookie = None
        self.__cookie_enabled = enable
        self.__install_opener()

    def AddHeader(self, kw={}):
        for k in kw:
            self.__headers_dic[k] = kw[k]

    def AddProxy(self, proxy_pair):
        self.__proxy_dic = proxy_pair
        self.__proxy_enable = True
        self.__install_opener()

    def SetDebugLevel(self, level=0):
        from httplib import HTTPConnection
        HTTPConnection.debuglevel = level
        self.__httpDebugLevel = level

    def __install_opener(self):
        if self._opener is None:
            self._opener = urllib2.build_opener(ContentEncodingProcessor())  # always support zlib
        if self.__cookie_enabled:
            self._opener.add_handler(urllib2.HTTPCookieProcessor(self.__cookie))
        if self.__proxy_enable:
            self._opener.add_handler(urllib2.ProxyHandler(self.__proxy_dic))
        urllib2.install_opener(self._opener)

    def __init_header(self, url):
        if self.header_refer_ is not None:
            self.__headers_dic['Referer'] = self.header_refer_
        if self.header_user_agent_ is not None:
            self.__headers_dic['User-Agent'] = self.header_user_agent_
        if False and self.__cookie_enabled:  # disabled: cookies are handled by HTTPCookieProcessor
            self.__cookie_str = ""
            for s in self.__cookie:
                self.__cookie_str += ";" + s
            if self.__cookie_str.strip() != "":
                self.__headers_dic['Set-Cookie'] = self.__cookie_str
        return self.__headers_dic
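
# Usage sketch (assumption, not part of the original module): a simple GET with
# cookie support and a proxy; the proxy address and URL are placeholders.
def _example_http_client():
    client = HttpClient(req_timeout=30)
    client.EnableCookieSupport(True)
    client.AddProxy({'http': '127.0.0.1:8087'})   # placeholder proxy
    page = client.TryGet("http://example.com/", retry=3)
    return page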
class Fetcher(object):
    def __init__(self, log=None):
        self.filters = []
        self.log = LogAdapter(log)
        self.result_data_type = None
        self.datas = []

    def execute(self):
        for flt in self.filters:
            if flt[0] == 1:
                self.filter_get(flt[1], flt[2:])
            elif flt[0] == 2:
                self.filter_process(flt[1])
            elif flt[0] == 3:
                self.filter_result(flt[1:])
            elif flt[0] == 4:
                self.filter_result_db(*flt[1:])   # conn, table, cols, values_format, types

    def filter_get(self, charset="utf8", urls=[]):
        if len(urls) == 0:
            self.log.warn("no incoming resource")
        htmls = []
        for url in urls:
            try:
                from vavava.httpclient import HttpClient
                client = HttpClient(log=None, debug_level=0, req_timeout=30)
                data = client.Get(url)
                if data:
                    htmls.append(data.decode(charset))
                else:
                    self.log.debug(url)
            except Exception as e:
                self.log.exception("%s: %s" % (url, e))
        self.datas = htmls

    def filter_process(self, reg_str=""):
        result = []
        for data in self.datas:
            try:
                matches = reg_helper(data, reg_str)
                for match in matches:
                    result.append(match)
            except Exception as e:
                self.log.exception(e)
        self.datas = result

    def filter_result(self, keys=[]):
        class result_data:
            def __init__(self, values=[]):
                self.values = values

            def _key(self):
                key = ""
                for i in keys:
                    key += self.values[i]
                return key

            def __lt__(self, other):
                return self._key() < other._key()

            def __hash__(self):
                return hash(self._key())

        results = []
        for i in range(len(self.datas)):
            results.append(result_data(self.datas[i]))
        self.datas = results

    def filter_result_db(self, conn, table, cols, values_format, types):
        if not (conn and table and cols and values_format):
            return
        sql = """ insert into %s(%s) values(%s) """
        sql1 = sql % (table, cols, values_format)
        cursor = conn.cursor()
        for result in self.datas:
            tmp = []
            for i in range(len(types)):
                tmp.append(self.data(types[i], result[i]))
            sql2 = sql1 % tuple(tmp)
            cursor.execute(sql2)
        conn.commit()

    def data(self, t, data):
        if t == "string":
            return data
        elif t == "int":
            return int(data)
        elif t == "datetime":
            import time
            return time.strptime(data, "%d/%m/%y %H:%M")
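
# Usage sketch (assumption, not part of the original module): the filter list drives
# execute() -- 1 = fetch URLs, 2 = apply a regex via reg_helper, 3 = wrap results,
# 4 = insert into a database. The URL and regex below are placeholders, and only the
# fetch and extraction steps are shown.
def _example_fetcher():
    f = Fetcher()
    f.filters.append((1, "utf8", "http://example.com/list"))   # step 1: fetch
    f.filters.append((2, r'<td>(.*?)</td>'))                   # step 2: extract with reg_helper
    f.execute()
    return f.datas   # regex matches collected from the fetched page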