Beispiel #1
0
 def __init__(self):
     """Initialise logging, runtime settings and the HTML parser helper.

     Logging is configured from ``logging.ini`` and the ``jsonpost`` logger
     is stored on the instance. The ``retry_times`` and ``page_interval``
     values are read from the ``Runtime`` section of ``configuration.ini``
     and stored as integers.
     """
     logging.config.fileConfig("logging.ini")
     self.__logger = logging.getLogger('jsonpost')

     # Load the runtime settings; the reader itself is not kept afterwards.
     settings = configuration.configuration()
     settings.fileConfig("configuration.ini")

     retry_raw = settings.getValue("Runtime", "retry_times")
     self.__RETRY_TIMES__ = int(retry_raw)

     interval_raw = settings.getValue("Runtime", "page_interval")
     self.__PAGE_INTERVAL__ = int(interval_raw)

     # Helper object from the project's htmlparser module.
     self.__h = htmlparser.htmlpaser()
Beispiel #2
0
    def __init__(self, baseurl):
        """Create the collaborating components and store the loop settings.

        Args:
            baseurl: Forwarded unchanged to ``htmldownloader`` as its
                ``baseurl`` keyword argument.
        """
        # One instance of each collaborator; only the downloader receives
        # an argument.
        self.urlmanager = urlmanager()
        self.htmldownloader = htmldownloader(baseurl=baseurl)
        self.htmlparser = htmlpaser()
        self.dataoutput = dataoutput()

        # NOTE(review): ``max_iter_time`` and ``sleeptime`` are neither
        # parameters nor locals of this method. Unless they are defined at
        # module level (not visible here), these two lines raise NameError.
        # Confirm whether they were meant to be constructor parameters.
        self.max_iter_time = max_iter_time
        self.sleeptime = sleeptime
Beispiel #3
0
 def __init__(self):
     """Initialise logging, configuration and the helper objects.

     Logging is configured from ``logging.ini`` and the ``grab`` logger is
     stored on the instance. The configuration reader is kept on the
     instance after loading ``configuration.ini``; ``retry_times`` from
     its ``Runtime`` section is stored as an integer. The task id starts
     out as an empty string.
     """
     logging.config.fileConfig("logging.ini")
     self.__logger = logging.getLogger('grab')

     # Keep the configuration reader on the instance and load the file.
     cfg = configuration.configuration()
     self.__c = cfg
     cfg.fileConfig("configuration.ini")

     retry_raw = cfg.getValue("Runtime", "retry_times")
     self.__RETRY_TIMES__ = int(retry_raw)

     # Helper objects, created in the same order as before.
     self.__f = formpage.formpage()
     self.__h = htmlparser.htmlpaser()
     self.__j = jsonpost.jsonpost()
     self.__js = jspost.jspost()

     self.__taskID = ""
Beispiel #4
0
    def __init__(self):
        """Create two identically configured browsers, then load settings.

        Both ``mechanize.Browser`` instances share one ``LWPCookieJar`` and
        receive the same handler options, refresh handling and User-Agent
        header. Afterwards logging is configured from ``logging.ini``, the
        ``formpage`` logger is stored, and ``retry_times`` and
        ``page_interval`` are read as integers from the ``Runtime`` section
        of ``configuration.ini``.
        """
        main_browser = mechanize.Browser()
        aux_browser = mechanize.Browser()
        self.__br = main_browser
        self.__br_i = aux_browser

        # A single cookie jar is shared by both browsers.
        jar = cookielib.LWPCookieJar()

        # Both browsers send the same desktop Firefox User-Agent string.
        user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'

        for browser in (main_browser, aux_browser):
            browser.set_cookiejar(jar)

            # Handler options: everything enabled except robots.txt handling.
            browser.set_handle_equiv(True)
            browser.set_handle_gzip(True)
            browser.set_handle_redirect(True)
            browser.set_handle_referer(True)
            browser.set_handle_robots(False)

            # Per the original author's note: follow "refresh 0" but do not
            # hang on refreshes with a longer delay.
            browser.set_handle_refresh(
                mechanize._http.HTTPRefreshProcessor(), max_time=1)

            # Each browser gets its own header list object.
            browser.addheaders = [('User-agent', user_agent)]

        logging.config.fileConfig("logging.ini")
        self.__logger = logging.getLogger('formpage')

        # Load the runtime settings; the reader itself is not kept afterwards.
        settings = configuration.configuration()
        settings.fileConfig("configuration.ini")

        retry_raw = settings.getValue("Runtime","retry_times")
        self.__RETRY_TIMES__ = int(retry_raw)

        interval_raw = settings.getValue("Runtime","page_interval")
        self.__PAGE_INTERVAL__ = int(interval_raw)

        # Helper object from the project's htmlparser module.
        self.__h = htmlparser.htmlpaser()