Exemple #1
0
class Fetcher:
    """Run a fetch and dispatch to registered result / exception handlers.

    ``Fetch`` calls ``self._fetch()``, which is not defined in this class and
    therefore has to be supplied by a subclass.  Handlers are objects that
    expose a ``process()`` method.
    """

    def __init__(self, item, debug_level=0, soc_timeout=10, log=None, name="<unknown>"):
        """Create the logger, the HTTP client and the empty handler lists.

        ``item`` is accepted for interface compatibility but is not used by
        this constructor.  ``soc_timeout`` is forwarded to ``HttpClient`` as
        ``req_timeout``.
        """
        self._name = name
        self.log = LogAdapter(log=log)
        self._client = HttpClient(log, debug_level=debug_level, req_timeout=soc_timeout)
        self._client.timeout = 11
        self._html = ""
        self.result_handlers = []
        self.exception_handler = []
        self.result = []

    def Fetch(self):
        """Run ``_fetch()`` and notify the matching handler list.

        On success the result handlers run; when ``_fetch()`` raises, the
        exception is logged and the exception handlers run (once).
        """
        try:
            self._fetch()
        except Exception as e:
            self.log.exception(e)
            self.handle_exception()
        else:
            # Previously the exception handlers were invoked here as well,
            # so result handlers never ran at all.
            self.handle_result()

    def add_handler(self, handler=None, exception_handler=None):
        """Register a result handler and/or an exception handler."""
        if exception_handler:
            # The list attribute is named ``exception_handler`` (singular);
            # that is the one ``handle_exception`` iterates over.
            self.exception_handler.append(exception_handler)
        if handler:
            self.result_handlers.append(handler)

    def handle_result(self):
        """Call ``process()`` on every registered result handler."""
        for handler in self.result_handlers:
            handler.process()

    def handle_exception(self):
        """Call ``process()`` on every registered exception handler."""
        for handler in self.exception_handler:
            handler.process()
Exemple #2
0
 def __init__(self, interval=1.0, accuracy=0.01, function=None, log=None, args=(), kwargs=None):
     """Store the timer settings and the callable with its arguments.

     ``kwargs`` defaults to ``None`` and is replaced by a fresh dict, so
     instances no longer share one mutable default dictionary.
     """
     BaseThread.__init__(self, log)
     self.log = LogAdapter(log)
     self._interval = interval
     self._accuracy = accuracy
     self._func = function
     self._args = args
     self._kwargs = {} if kwargs is None else kwargs
Exemple #3
0
 def __init__(self, item, debug_level=0, soc_timeout=10, log=None, name="<unknown>"):
     """Set up the logger, the HTTP client and empty handler/result lists.

     ``item`` is accepted but not used here; ``soc_timeout`` is passed to
     ``HttpClient`` as ``req_timeout``.
     """
     self._name = name
     self.log = LogAdapter(log=log)

     http = HttpClient(log, debug_level=debug_level, req_timeout=soc_timeout)
     http.timeout = 11
     self._client = http

     self._html = ""
     self.result_handlers, self.exception_handler, self.result = [], [], []
Exemple #4
0
 def init(self, period=None, group="", begin_time=None,
          end_time=None, log=None, name="<?work>", *args, **kwargs):
     """Assign logger, group, schedule fields and extra arguments, then rename.

     Extra positional and keyword arguments are kept verbatim in
     ``_args`` / ``_kwargs``; the name is applied through ``setName``.
     """
     self.log = LogAdapter(log)
     self._group_id = group
     self._begin_time, self._end_time = begin_time, end_time
     self._period = period
     self._args, self._kwargs = args, kwargs
     self.setName(name)
Exemple #5
0
 def __init__(self, item_id, track_type, log=None):
     """Record the tracked item and create the default collaborators.

     The database handles (``conn`` / ``db``) start out as ``None`` and
     ``initialised`` starts out ``False``.
     """
     self.log = LogAdapter(log)

     # What is being tracked.
     self.item_id, self.track_type = item_id, track_type
     self.name = "FetcherBase"

     # Tracking state and proxy settings start out empty.
     self.proxy_dict = ""
     self.track_dict = {}

     # HTTP client with a 30 second request-timeout attribute.
     client = HttpClient()
     client.req_timeout = 30
     self.http = client

     # No database attached yet.
     self.conn = self.db = None

     self.debug_level = 0
     self.initialised = False
Exemple #6
0
 def __init__(self, period=None, group="", begin_time=None,
              end_time=None, log=None, name="<?work>", *args, **kwargs):
     """Initialise the base object and all scheduling/bookkeeping fields.

     Extra positional and keyword arguments are kept verbatim in
     ``_args`` / ``_kwargs``.
     """
     OBase.__init__(self, name=name)
     self.log = LogAdapter(log)
     self._group_id = group

     # Execution state flags.
     self._done = self._redoable = False

     # Schedule description.
     self._begin_time, self._end_time = begin_time, end_time
     self._period = period

     self._db_conn = None
     self._args, self._kwargs = args, kwargs

     # Id for serial works.
     self._serial_id_ = None
Exemple #7
0
 def __init__(self, log=None, debug_level=0, req_timeout=30):
     """Initialise client state: logger, timeout, headers, cookie/proxy flags.

     Cookie and proxy support start disabled and no opener exists yet.
     ``SetDebugLevel(debug_level)`` is invoked as part of construction.
     """
     default_referer = "http://www.google.com/"

     self.__log = LogAdapter(log)
     self.__content = None

     # Cookie state (disabled by default).
     self.__cookie = None
     self.__cookie_str = ""

     self.__req_timeout = req_timeout
     self.__httpDebugLevel = debug_level
     self.__headers_dic = {'Referer': default_referer}
     self.__cookie_enabled = False

     # Proxy state (disabled by default).
     self.__proxy_enable = False
     self.__proxy_dic = None

     self._opener = None
     self.__is_busy = False
     self.__buffer_size = 100 * 1024
     self.SetDebugLevel(debug_level)

     # Header values that callers may overwrite directly.
     self.header_refer_ = default_referer
     self.header_user_agent_ = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
Exemple #8
0
class LoopTimer(BaseThread):
    """Repeatedly call a function, waiting at least ``interval`` seconds between calls.

        t = LoopTimer(30.0, function=f, args=(), kwargs={})

    The loop in :meth:`do` keeps running while ``self.IsActive`` is true.
    ``IsActive`` and the ``is_active`` event are not defined here; they are
    presumably provided by ``BaseThread`` -- confirm against that class.
    """

    def __init__(self, interval=1.0, accuracy=0.01, function=None, log=None, args=(), kwargs=None):
        """Store the timer settings.

        interval -- minimum number of seconds between two calls.
        accuracy -- timeout, in seconds, of each wait while idle.
        function -- callable to run; a warning is logged each round if unset.
        args, kwargs -- arguments passed to ``function`` on every call.
        """
        BaseThread.__init__(self, log)
        self.log = LogAdapter(log)
        self._interval = interval
        self._accuracy = accuracy
        self._func = function
        self._args = args
        # A fresh dict per instance instead of one shared mutable default.
        self._kwargs = {} if kwargs is None else kwargs

    def do(self):
        """Run the timer loop until ``self.IsActive`` becomes false."""
        self.is_active.clear()
        last_run = None   # wall-clock time of the previous call, None before the first
        longest = 0.0     # largest gap between two calls seen so far
        while self.IsActive:
            # Wall-clock time: time.clock() is CPU time on Unix and does not
            # advance while this thread is waiting, so the interval would
            # never elapse (and the old ``assert interval > 0`` could fire).
            now = time.time()
            elapsed = 0.0 if last_run is None else now - last_run
            if last_run is None or elapsed > self._interval:
                longest = max(longest, elapsed)
                self.log.debug("LoopTimer current interval = %f (%f)", elapsed, longest)
                last_run = now
                if self._func:
                    self.log.debug("LoopTimer.func started")
                    # Forward the arguments given to the constructor; they
                    # used to be stored but never passed on.
                    self._func(*self._args, **self._kwargs)
                    self.log.debug("LoopTimer.func ended")
                else:
                    self.log.warn("LoopTimer has no target to run")
            else:
                self.is_active.wait(timeout=self._accuracy)

        self.is_active.set()
Exemple #9
0
class _db_conn_helper:
    """Pairs a database connection with a cursor and a logger."""

    def __init__(self, conn=None, log=None):
        """Keep ``conn`` and open a cursor on it right away."""
        self.log = LogAdapter(log)
        self._conn = conn
        self._cursor = conn.cursor()

    def _handle_exception(self, e):
        """Log ``e`` and try to bring the connection back.

        The error is taken to mean that the connection (or the database
        itself) failed in the middle of a transaction, which cannot be
        reconnected automatically, so the connection is reset by hand and a
        new cursor is opened.  A failing ping check afterwards is logged.
        """
        self.log.exception(e)
        conn = self._conn
        if conn._transaction:
            self.log.error("db conn is dead,1111111111111.")
            conn._reset(True)
            self._cursor = conn.cursor()
        alive = conn._ping_check()
        if not alive:
            self.log.error("db conn is dead,reconnect failed.")
Exemple #10
0
 def __init__(self, conn=None, log=None):
     """Keep ``conn``, wrap ``log`` in a LogAdapter and open a cursor on ``conn``."""
     self.log = LogAdapter(log)
     self._conn = conn
     self._cursor = conn.cursor()
Exemple #11
0
class FetcherBase:
    """Base class for trackers: fetch item info, detect new records, store them.

    ``Fetch`` calls ``self._fetch()``, which is not defined in this class and
    therefore has to be supplied by a subclass.
    """

    # Directory (relative to the working directory) where ``_dump_error``
    # stores pages.  It used to be a local variable that was then read as
    # ``self.SAVEPATH``, which raised AttributeError.
    SAVEPATH = r'ex-pages'

    def __init__(self, item_id, track_type, log=None):
        """Record the tracked item and create the default collaborators."""
        self.log = LogAdapter(log)
        self.item_id = item_id
        self.track_type = track_type
        self.name = "FetcherBase"
        self.proxy_dict = ""
        self.track_dict = {}
        self.http = HttpClient()
        self.http.req_timeout = 30
        self.conn = None
        self.db = None
        self.debug_level = 0
        self.initialised = False

    def SetProxy(self, proxy_dict):
        """Remember ``proxy_dict`` and, if it is non-empty, install it on the HTTP client."""
        self.proxy_dict = proxy_dict
        if proxy_dict:
            self.http.AddProxy(self.proxy_dict)

    def SetDBConn(self, conn):
        """Attach a database connection and build the ``TrackDB`` wrapper on it."""
        self.conn = conn
        self.db = track_db.TrackDB(db_conn=self.conn, log=self.log)

    def Fetch(self):
        """Run ``_fetch()``; any exception is passed to ``_error_handle``."""
        try:
            self._fetch()
        except Exception as e:
            self._error_handle(e)

    def _chk_new_items(self, item_list):
        """Store every entry of ``item_list`` that is not yet in ``track_dict``.

        Returns ``False`` when the list is empty or an error occurred (the
        error is passed to ``_error_handle``), ``True`` otherwise.
        """
        try:
            counter = len(item_list)
            if counter == 0:
                self.log.debug("[%s] Item info chk over with result is 0.", self.item_id)
                return False
            # got item info
            if not self.initialised:
                # First call: load the already stored records from the db.
                self.track_dict = self.db.item_get_top_n(self.item_id, counter, self.track_type)
                self.initialised = True
            for item_info in item_list:
                if item_info in self.track_dict:
                    continue
                self.track_dict[item_info] = item_info
                result = self._new_data(item_info)
                if result == -1:
                    self.log.warn("[%s] Item info already in db, track_time=%s",
                                  self.name, str(item_info.track_time))
                elif result == 2:
                    self.log.info("[%s] Tracking ended for track_time=%s",
                                  self.name, str(item_info.track_time))
            self.log.debug("[%s] Item info chk over.", self.item_id)
            return True
        except Exception as e:
            # Pass the exception itself (like ``Fetch`` does) so that the
            # traceback is logged too.
            self._error_handle(e)
            return False

    def _new_data(self, item):
        """Log a new track record and insert it through the db wrapper.

        Returns the result of ``db.sp_insert_new_item``:
             1   new data stored
             2   tracking ended
            -1   already in db

        Raises ``Exception`` when no db has been attached.
        """
        self.log.info("[%s] New track info coming:[type=%s|delivered=%d] ITEM=%s,%s,%s,%s",
                      self.name, self.track_type, item.is_ended, item.name,
                      item.description, item.location, item.track_time)
        if self.db:
            return self.db.sp_insert_new_item(self.track_type, item.is_ended,
                item.name, item.track_time, item.description, item.location)
        raise Exception("db did not initialised")

    def _error_handle(self, msg):
        """Log an error and update the item status in the db.

        ``msg`` may be a message string or an exception instance; for an
        exception the traceback is logged as well.  Returns the result of
        ``db.sp_update_item_status`` when a db is attached, else ``None``.
        """
        if msg:
            self.log.error("[%s] error happend:%s", self.item_id, msg)
        # ``msg`` is an instance, so this must be isinstance(); issubclass()
        # raised TypeError here.  It also has to run before the early return
        # below, otherwise it is skipped whenever a db is attached.
        if isinstance(msg, BaseException):
            self.log.exception(msg)
        if self.db:
            return self.db.sp_update_item_status(self.track_type, self.item_id)
        self.log.error("[%s] error happend: db access error at meantime...", self.item_id)

    def _dump_error(self, item_id, fetch_url, text="", e=None):
        """Save the page ``text`` fetched from ``fetch_url`` under ``SAVEPATH``.

        The file is named ``<item_id>_<YYYYmmddHHMMSS>.html`` and starts with
        an HTML comment holding the URL.  Nothing happens when ``text`` is
        empty.  ``e`` is accepted but not used.
        """
        import codecs
        import os
        from time import localtime, strftime
        if not text:
            return
        # Zero-padded timestamp: the unpadded "%d%d%d..." form was ambiguous
        # (e.g. 1/11 and 11/1 produced the same digits).
        stamp = strftime("%Y%m%d%H%M%S", localtime())
        filename = r'./%s/%s_%s.html' % (self.SAVEPATH, item_id, stamp)
        vavava.util.assure_path("./%s" % self.SAVEPATH)
        # ``with`` closes the file even when a write fails.
        with codecs.open(filename, "w", 'utf-8') as f:
            f.write('<!--' + fetch_url + '-->' + os.linesep)
            f.write(text)
        self.log.error(r"page saved at %s", os.path.abspath(filename))
Exemple #12
0
class HttpClient(object):
    """A simple HTTP client built on a urllib2 opener.

    Supports GET/POST, streaming a response into a file object, optional
    cookie and proxy handling, and custom request headers.
    """

    def __init__(self, log=None, debug_level=0, req_timeout=30):
        """Initialise client state; cookies and proxy start disabled.

        log -- logger wrapped in a LogAdapter.
        debug_level -- passed to ``SetDebugLevel``.
        req_timeout -- timeout in seconds used for requests.
        """
        self.__log = LogAdapter(log)
        self.__content = None
        self.__cookie = None
        self.__cookie_str = ""
        self.__req_timeout = req_timeout
        self.__httpDebugLevel = debug_level
        self.__headers_dic = {'Referer': "http://www.google.com/"}
        self.__cookie_enabled = False
        self.__proxy_enable = False
        self.__proxy_dic = None
        self._opener = None
        self.__is_busy = False
        self.__buffer_size = 1024 * 100
        self.SetDebugLevel(debug_level)
        self.header_refer_ = "http://www.google.com/"
        self.header_user_agent_ = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'

    def Get(self, url, download_callback=None):
        """GET ``url`` and return the complete response body.

        ``download_callback`` is accepted but not used.
        """
        if self._opener is None:
            self.__install_opener()
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, headers=self.__headers_dic)
        resp = self._opener.open(req, timeout=self.__req_timeout)
        try:
            if url != resp.url:
                self.__log.debug("%s redirect to :%s", url, resp.url)
            self.__content = resp.read()
        finally:
            resp.close()
        return self.__content

    def TryGet(self, url, download_callback=None, retry=3):
        """Like ``Get`` but retry up to ``retry`` times after a failure.

        The exception of the last attempt is re-raised once the retries are
        used up.  (Previously a failure was swallowed and ``None`` returned
        without any retry taking place.)
        """
        while True:
            try:
                return self.Get(url, download_callback)
            except Exception as e:
                if retry <= 0:
                    raise
                retry -= 1
                self.__log.debug("GET %s failed (%s), %d retries left", url, e, retry)

    def Post(self, url, post_dic):
        """POST ``post_dic`` (url-encoded, gb2312) to ``url``.

        Returns at most ``1024*100`` bytes of the response body.
        """
        if self._opener is None:
            self.__install_opener()
        postdata = urllib.urlencode(post_dic).encode('gb2312')
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, data=postdata, headers=self.__headers_dic)
        # Explicit timeout, consistent with Get/GetData.
        resp = self._opener.open(req, timeout=self.__req_timeout)
        try:
            self.__content = resp.read(self.__buffer_size)
        finally:
            resp.close()
        return self.__content

    def GetData(self, url, fp, duration=None, buffer_size=1024*1024):
        """Stream the body of ``url`` into ``fp`` in ``buffer_size`` chunks.

        When ``duration`` (seconds) is given, the download stops after the
        first chunk written past that wall-clock limit.
        """
        # Wall-clock deadline; time.clock() is CPU time on Unix and barely
        # advances while the process is blocked on the network.
        stop_time = time.time() + float(duration) if duration else None
        if self._opener is None:
            self.__install_opener()
        self.__init_header(url)
        socket.setdefaulttimeout(self.__req_timeout)
        req = urllib2.Request(url, headers=self.__headers_dic)
        resp = self._opener.open(req, timeout=self.__req_timeout)
        try:
            if url != resp.url:
                self.__log.debug("%s redirect to :%s", url, resp.url)
            data = resp.read(buffer_size)
            while data:
                fp.write(data)
                if stop_time is not None and stop_time < time.time():
                    return
                data = resp.read(buffer_size)
        finally:
            resp.close()

    def EnableCookieSupport(self, enable=True):
        """Turn cookie handling on or off and rebuild the opener.

        Enabling keeps an existing cookie jar (it used to be discarded when
        this was called a second time); disabling drops it.
        """
        if enable:
            if self.__cookie is None:
                self.__cookie = LWPCookieJar()
        else:
            self.__cookie = None
        self.__cookie_enabled = enable
        self.__install_opener()

    def AddHeader(self, kw=None):
        """Merge the mapping ``kw`` into the request headers."""
        if kw:
            self.__headers_dic.update(kw)

    def AddProxy(self, proxy_pair):
        """Enable proxying with ``proxy_pair`` and rebuild the opener."""
        self.__proxy_dic = proxy_pair
        self.__proxy_enable = True
        self.__install_opener()

    def SetDebugLevel(self, level=0):
        """Set ``HTTPConnection.debuglevel`` (class-wide) and remember the level."""
        from httplib import HTTPConnection
        HTTPConnection.debuglevel = level
        self.__httpDebugLevel = level

    def __install_opener(self):
        """Build an opener matching the current settings and install it globally.

        The opener is rebuilt from scratch on every call; adding handlers to
        the existing opener piled up duplicates and never removed the cookie
        handler when cookie support was switched off again.
        """
        handlers = [ContentEncodingProcessor()]  # always support zlib
        if self.__cookie_enabled:
            handlers.append(urllib2.HTTPCookieProcessor(self.__cookie))
        if self.__proxy_enable:
            handlers.append(urllib2.ProxyHandler(self.__proxy_dic))
        self._opener = urllib2.build_opener(*handlers)
        urllib2.install_opener(self._opener)

    def __init_header(self, url):
        """Refresh the Referer / User-Agent headers and return the header dict.

        ``url`` is currently not used.
        """
        # The Referer is guarded by its own attribute (it used to test the
        # user-agent attribute by mistake).
        if self.header_refer_ is not None:
            self.__headers_dic['Referer'] = self.header_refer_
        if self.header_user_agent_ is not None:
            self.__headers_dic['User-Agent'] = self.header_user_agent_
        # A permanently disabled (``if False``) branch that copied cookies
        # into a 'Set-Cookie' request header was removed; cookies are
        # handled by HTTPCookieProcessor in the opener.
        return self.__headers_dic
Exemple #13
0
 def __init__(self, log=None):
     """Start with no filters, no data and no result type; wrap ``log``."""
     self.log = LogAdapter(log)
     self.filters, self.datas = [], []
     self.result_data_type = None
Exemple #14
0
class Fetcher(object):
    """Small pipeline that runs the steps listed in ``self.filters``.

    Each filter is a sequence whose first element selects the step:

        1  filter_get(charset, *urls)        download pages
        2  filter_process(reg_str)           extract matches
        3  filter_result(*keys)              wrap rows in sortable objects
        4  filter_result_db(conn, table, cols, values_format, types)

    Every step reads and replaces ``self.datas`` (step 4 only reads it).
    """

    def __init__(self, log=None):
        self.filters = []
        self.log = LogAdapter(log)
        self.result_data_type = None
        self.datas = []

    def execute(self):
        """Run all configured filters in order."""
        for step in self.filters:
            op = step[0]
            if op == 1:
                self.filter_get(step[1], step[2:])
            elif op == 2:
                self.filter_process(step[1])
            elif op == 3:
                self.filter_result(step[1:])
            elif op == 4:
                # filter_result_db needs five arguments; passing only two
                # always raised TypeError.
                self.filter_result_db(*step[1:])

    def filter_get(self, charset="utf8", urls=None):
        """Download every url and store the decoded bodies in ``self.datas``.

        A url that fails is logged and skipped.
        """
        if not urls:
            urls = []
            self.log.warn("no income resource")
        htmls = []
        for url in urls:
            try:
                from vavava.httpclient import HttpClient

                client = HttpClient(log=None, debug_level=0, req_timeout=30)
                data = client.Get(url)
                if data:
                    htmls.append(data.decode(charset))
                else:
                    self.log.debug(url)
            except Exception as e:
                # Log the url as an argument instead of using it as the
                # format string, and use the adapter like filter_process does.
                self.log.error("fetch failed: %s", url)
                self.log.exception(e)
        self.datas = htmls

    def filter_process(self, reg_str=""):
        """Replace ``self.datas`` with all ``reg_helper`` matches found in it."""
        result = []
        for data in self.datas:
            try:
                result.extend(reg_helper(data, reg_str))
            except Exception as e:
                self.log.exception(e)
        self.datas = result

    def filter_result(self, keys=None):
        """Wrap every row of ``self.datas`` in an object ordered by ``keys``.

        ``keys`` are indexes into a row; the concatenation of those fields is
        the sort / hash / equality key of the wrapper.
        """
        keys = tuple(keys) if keys else ()

        class result_data:
            def __init__(self, values=None):
                self.values = [] if values is None else values

            def _key(self):
                return "".join(self.values[i] for i in keys)

            def __lt__(self, other):
                return self._key() < other._key()

            def __eq__(self, other):
                if not isinstance(other, result_data):
                    return NotImplemented
                return self._key() == other._key()

            def __hash__(self):
                # Was ``hasattr(self._key())``, which raised TypeError.
                return hash(self._key())

        self.datas = [result_data(row) for row in self.datas]

    def filter_result_db(self, conn, table, cols, values_format, types):
        """Insert every row of ``self.datas`` into ``table``.

        cols -- comma separated column list.
        values_format -- comma separated parameter placeholders in the
            paramstyle of the DB-API driver behind ``conn`` (e.g. ``?, ?``
            or ``%s, %s``); the values are passed as bound parameters
            instead of being interpolated into the SQL text.
        types -- one type name per column, see :meth:`data`.

        Does nothing when one of the first four arguments is empty.
        """
        if not (conn and table and cols and values_format):
            return
        sql = "insert into %s(%s) values(%s)" % (table, cols, values_format)
        cursor = conn.cursor()
        for result in self.datas:
            # Rows are either plain sequences or the wrappers produced by
            # filter_result, which keep the sequence in ``.values``.
            values = getattr(result, "values", None)
            row = result if values is None or callable(values) else values
            # Fresh parameter tuple per row (the list used to grow across
            # rows and was not a tuple).
            params = tuple(self.data(t, row[i]) for i, t in enumerate(types))
            cursor.execute(sql, params)
            conn.commit()

    def data(self, t, data):
        """Convert ``data`` according to the type name ``t``.

        "string" returns it unchanged, "int" converts with ``int()`` and
        "datetime" parses ``%d/%m/%y %H:%M`` with ``time.strptime``.  Any
        other type name yields ``None``.
        """
        if t == "string":
            return data
        elif t == "int":
            return int(data)
        elif t == "datetime":
            import time

            return time.strptime(data, "%d/%m/%y %H:%M")
Exemple #15
0
class Work(OBase):
    """
    Base class for user work items; ``do()`` must be overridden.
    For priority work you need to override ``__lt__`` and ``__hash__``.
    """
    def __init__(self, period=None, group="", begin_time=None,
                 end_time=None, log=None, name="<?work>", *args, **kwargs):
        """Initialise the base object and all scheduling/bookkeeping fields.

        Extra positional and keyword arguments are kept verbatim in
        ``_args`` / ``_kwargs``.
        """
        OBase.__init__(self, name=name)
        self.log          = LogAdapter(log)
        self._group_id    = group
        self._done        = False
        self._redoable    = False
        self._begin_time  = begin_time
        self._end_time    = end_time
        self._period      = period
        self._db_conn     = None
        self._args        = args
        self._kwargs      = kwargs
        self._serial_id_  = None  # specify id for serial works,
                                  # in order to run these works in the same thread

    def init(self, period=None, group="", begin_time=None,
             end_time=None, log=None, name="<?work>", *args, **kwargs):
        """Re-assign logger, group, schedule fields, arguments and name.

        Unlike ``__init__`` this leaves ``_done``, ``_redoable``,
        ``_db_conn`` and ``_serial_id_`` untouched.
        """
        self.log          = LogAdapter(log)
        self._group_id    = group
        self._begin_time  = begin_time
        self._end_time    = end_time
        self._period      = period
        self._args        = args
        self._kwargs      = kwargs
        self.setName(name)

    def do(self, worker=None):
        """The actual work; does nothing here and is meant to be overridden."""
        pass

    def _do(self, worker=None):
        """Mark the work as done and run ``do()``; failures are logged, not raised."""
        try:
            self._done = True
            self.do(worker)
        except Exception as e:
            self.log.error("delete work:group=%s,name=%s,%s", self._group_id, self._name, e)
            self.log.exception(e)
        except:
            # Anything that is not an Exception subclass is still swallowed
            # as before (best effort), but no longer silently.
            self.log.error("work aborted:group=%s,name=%s", self._group_id, self._name)

    def add_work_to_parent(self, work):
        """Queue ``work`` on the parent work queue.

        Raises ``Exception`` when this work has no parent queue.
        """
        # ``_wq_parent`` is not set anywhere in this class; getattr() makes
        # a missing attribute raise the intended error, not AttributeError.
        parent = getattr(self, "_wq_parent", None)
        if parent:
            parent.QueueWork(work)
        else:
            raise Exception("GeneratorWork has no parent handle")

    def get_instance(self, xml):
        """Placeholder; does nothing here."""
        pass

    def __lt__(self, other):
        """Order by begin time; a work without begin time sorts first.

        Always returns a bool (it used to fall through and return ``None``
        when neither side had a begin time).
        """
        if self._begin_time and other._begin_time:
            return self._begin_time < other._begin_time
        if self._begin_time:
            return False
        return bool(other._begin_time)

    def __hash__(self):
        return hash(self._begin_time)