Example #1
	def scanPage(self, url, depth):
		# Fetch the page; webutils.setupRequest fills in the standard headers.
		req = urllib2.Request(url)
		webutils.setupRequest(req)
		response = self._opener.open(req)
		if response is None:
			return
		try:
			html = response.read()
		except Exception:
			return

		# Collect candidate links, skipping javascript: pseudo-URLs.
		links = self._reexp.findall(html)
		linkRec = set()
		for link in links:
			if re.search(r'^javascript:', link):
				continue
			link = self.adjustUrl(url, link)
			# Yield only links that are new and inside the crawl scope.
			if link not in self._linkList and link not in linkRec:
				if link.find(self._scope) != -1:
					linkRec.add(link)
					yield link
		self._linkList = self._linkList.union(linkRec)
		# Stop when the link budget is spent or the depth limit is reached.
		if self._maxCount >= 0 and len(self._linkList) >= self._maxCount:
			return

		depth -= 1
		if depth <= 0:
			return

		# Recurse one level deeper into each newly discovered link.
		for link in linkRec:
			for link2 in self.scanPage(link, depth):
				yield link2
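
Every example on this page passes the freshly built Request through webutils.setupRequest before opening it. webutils is a project-local module whose source is not shown here, so the sketch below is only a plausible reconstruction, assuming the helper's job is to attach browser-like headers (the exact header values are invented):

import urllib2

def setupRequest(req):
    # Assumed behavior: disguise the crawler as an ordinary browser so the
    # target site serves the same HTML a human visitor would get.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:38.0) '
                   'Gecko/20100101 Firefox/38.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    return req

req = urllib2.Request('http://example.com/')
setupRequest(req)
print req.header_items()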
Example #2
def _refreshCookie(opener, what):
    # Run a throwaway search so the site (re)issues its session cookie.
    what = urllib2.quote(what)
    url = GFSOSO_HOME + '?q=%s' % (what)
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    req.add_header('Referer', GFSOSO_HOME)
    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        if response.geturl().find(GFSOSO_HOME) == -1:
            # Bounced to a mirror: remember everything up to and including
            # the first '/' after 'http://' (i.e. the mirror's root URL).
            global RedirectedUrl
            RedirectedUrl = response.geturl()
            RedirectedUrl = RedirectedUrl[0:RedirectedUrl.find('/', 7) + 1]
            return False

        html = response.read()
    except urllib2.HTTPError, e:
        # Only HTTPError carries a status code; the body of a 301 response
        # may still contain the page we want.
        if e.code == 301:  # moved
            html = e.read()
        else:
            print "Exception: url: %s - " % url, e
            return False
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return False
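
_refreshCookie is only useful if the opener it receives remembers the cookie the site sets, so that later searches through the same opener replay it. A minimal sketch of how such a cookie-aware opener is built with the Python 2 standard library (the variable names are assumptions):

import cookielib
import urllib2

# A shared CookieJar: any Set-Cookie received here is replayed on every
# later request made through the same opener.
cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

response = opener.open('http://example.com/')
print len(cookie_jar), 'cookie(s) stored'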
Example #3
def _gfsosoPageHandler(opener, url):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    # url[:-4] presumably drops a fixed suffix to recover the base page
    # URL, which is then sent as the Referer.
    req.add_header('Referer', url[:-4])

    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        html = response.read()
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return  # in a generator, this ends the iteration
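
The timeout=REQ_TIMEOUT keyword (supported by OpenerDirector.open since Python 2.6) is what keeps a stalled server from hanging the whole crawl; without it the broad except above might never fire. A self-contained sketch of the two failure modes it guards against (REQ_TIMEOUT's value is not shown on this page, so 10 seconds is assumed):

import socket
import urllib2

REQ_TIMEOUT = 10  # seconds; assumed value, the name mirrors the examples

opener = urllib2.build_opener()
try:
    response = opener.open('http://example.com/', timeout=REQ_TIMEOUT)
    print response.getcode()
except socket.timeout:
    # a read that stalls past the deadline raises socket.timeout directly
    print 'request timed out'
except urllib2.URLError, e:
    # connect-phase timeouts and other network errors arrive as URLError
    print 'request failed:', e.reason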
Example #4
    def _pageHandler(self, url):
        req = urllib2.Request(url)
        webutils.setupRequest(req)
        # url[:-4] presumably drops a fixed suffix to recover the base URL.
        req.add_header('Referer', url[:-4])

        try:
            response = self._opener.open(req, timeout=self.reqTimeout)
            html = response.read()
        except Exception, e:
            print "Exception: url: %s - " % url, e
            return  # in a generator, this ends the iteration
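
Handlers in this style are written as generators: fetch one results page, yield the parsed items, and simply return (ending the iteration) when a fetch fails. A toy, self-contained sketch of that control flow, with itertools.islice capping how many results the caller pulls; _demoHandler and its fake pages are purely illustrative:

import itertools

def _demoHandler(pages):
    # Stand-in for a page handler: yields items until a page fails.
    for page in pages:
        if page is None:   # simulate a failed fetch
            return         # ends the generator, like the handlers above
        for item in page:
            yield item

pages = [['a', 'b'], ['c', 'd'], None, ['e']]
for item in itertools.islice(_demoHandler(pages), 3):
    print item             # prints a, b, c and stops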
Example #5
def _bingSearchPageHandler(opener, url):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    # Keep the (proxied) connection open between paged requests.
    req.add_header('Proxy-Connection', 'Keep-Alive')

    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        html = response.read()
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return  # in a generator, this ends the iteration
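
For completeness, a sketch of how the url handed to _bingSearchPageHandler might be built. Bing's public result pages use q for the query and first for the 1-based index of the first result on the page; the wrapper function below is hypothetical, as the original caller is not shown:

import urllib

def bingSearchUrl(query, page):
    # 'first' selects pagination: result 1, 11, 21, ... for pages 0, 1, 2, ...
    params = urllib.urlencode({'q': query, 'first': page * 10 + 1})
    return 'http://www.bing.com/search?' + params

print bingSearchUrl('filetype:pdf neural networks', 0)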