def load_cookies_from_file(path: Path) -> Union[MozillaCookieJar, None]:
    """Return a cookie jar loaded from *path*, or None when no such file exists."""
    if not path.is_file():
        return None
    cookie_jar = MozillaCookieJar(path)
    cookie_jar.load(ignore_discard=True)
    return cookie_jar
def __init__(self, mobile, password=None, status='0', cachefile='Fetion.cache', cookiesfile=''):
    '''Login status: online:400 invisible:0 busy:600 away:100
    '''
    # Optional local cache for responses.
    if cachefile:
        self.cache = Cache(cachefile)
    # Default the cookie file name to "<mobile>.cookies".
    if not cookiesfile:
        cookiesfile = '%s.cookies' % mobile
    cookiejar = MozillaCookieJar(filename=cookiesfile)
    # Create an empty Netscape-format cookie file on first run so load() succeeds.
    if not os.path.isfile(cookiesfile):
        open(cookiesfile, 'w').write(MozillaCookieJar.header)
    cookiejar.load(filename=cookiesfile)
    cookie_processor = HTTPCookieProcessor(cookiejar)
    self.opener = build_opener(cookie_processor, HTTPHandler)
    self.mobile, self.password = mobile, password
    # Re-login (and persist the new cookies) only when the saved session is dead.
    if not self.alive():
        self._login()
        cookiejar.save()
    self.changestatus(status)
def load_cookies(self, path=None):
    """Load cookies into the webdriver from a JSON or Netscape cookie file.

    Note: to load cookies for a domain, you must first navigate to that
    domain in the driver.

    :param path: cookie file path; defaults to ``self.cookies_file``.
    :returns: True on success, False when the JSON file does not exist.
    :raises Exception: when the file extension is not .json or .txt.
    """
    self.logger.info('Loading cookies')
    if path is None:
        path = self.cookies_file
    if path.endswith('.json'):
        try:
            # BUG FIX: previously opened self.cookies_file even when an
            # explicit `path` argument was supplied.
            with open(path, 'r') as f:
                cookies = json.load(f)
        except FileNotFoundError:
            return False
        for c in cookies:
            # Selenium rejects float expiry values; truncate to int.
            if isinstance(c.get('expiry'), float):
                c['expiry'] = int(c['expiry'])
            self.driver.add_cookie(c)
    elif path.endswith('.txt'):
        # Assume Mozilla/Netscape format.
        jar = MozillaCookieJar(path)
        jar.load()
        for c in jar:
            cookie = {
                k: getattr(c, k)
                for k in ['domain', 'name', 'value', 'secure', 'path']
            }
            if c.expires:
                cookie['expiry'] = c.expires
            self.driver.add_cookie(cookie)
    else:
        raise Exception('Unrecognized cookie extension')
    return True
def __init__(self, **kwargs):
    """Start a requests session with default browser headers and optional cookies."""
    default_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Accept-Language': 'en-US, en'
    }
    headers = kwargs.get('headers')

    # Set params for use later on
    # self.params = kwargs

    # Begin session
    self.session = requests.Session()
    self.session.headers = default_headers if headers is None else headers

    # Set cookies if present
    cookies = kwargs.get('cookies')
    jar = MozillaCookieJar(cookies)
    if cookies:
        # Only attempt to load if the cookie file exists.
        if os.path.exists(cookies):
            jar.load(ignore_discard=True, ignore_expires=True)
        else:
            raise CookieError(
                'The file "{}" could not be found.'.format(cookies))
    self.session.cookies = jar
def __init__(self, cookies=None, proxy=None):
    """Initialise a new session for making requests."""
    self.session = requests.Session()
    self.session.headers = self.__HEADERS
    # use proxy if present
    if(proxy is not None and isinstance(proxy, str)):
        # socks5:// resolves DNS locally; socks5h:// resolves through the proxy.
        if(proxy.startswith('socks5:')):
            print('detected socks5 protocol, please use socks5h', file=sys.stderr)
        print('use proxy', proxy)
        self.session.proxies.update({
            'http': proxy,
            'https': proxy
        })
    # set the place for chat_db
    self.chat_db: Optional[DBChat] = None
    cj = MozillaCookieJar(cookies)
    if cookies is not None:
        # Only attempt to load if the cookie file exists.
        if os.path.exists(cookies):
            cj.load(ignore_discard=True, ignore_expires=True)
        else:
            raise CookieError(
                "The file '{}' could not be found.".format(cookies))
    self.session.cookies = cj
def test_authholder_request_simple(auth_holder): """Load credentials the first time, hit the network, save credentials.""" # save a cookie to be used fake_cookiejar = MozillaCookieJar(auth_holder._cookiejar_filepath) fake_cookiejar.set_cookie(get_cookie()) fake_cookiejar.save() other_cookie = get_cookie(value="different") def fake_request(self, method, url, json, headers): # check it was properly called assert method == "testmethod" assert url == "testurl" assert json == "testbody" assert headers == {"User-Agent": build_user_agent()} # check credentials were loaded at this time assert auth_holder._cookiejar is not None # modify the credentials, to simulate that a re-auth happened while the request auth_holder._cookiejar.set_cookie(other_cookie) return "raw request response" with patch("macaroonbakery.httpbakery.Client.request", fake_request): resp = auth_holder.request("testmethod", "testurl", "testbody") # verify response (the calling checks were done above in fake_request helper) assert resp == "raw request response" # check the credentials were saved (that were properly loaded was also check in above's helper) new_cookiejar = MozillaCookieJar(auth_holder._cookiejar_filepath) new_cookiejar.load() assert list(new_cookiejar)[0].value == other_cookie.value
def would_decorate_driver_with_cookies(driver, cookie_file):
    """Lazily install every cookie from *cookie_file* into the webdriver.

    Returns a map object: cookies are only added as the result is consumed.
    """
    cookie_jar = MozillaCookieJar(cookie_file)
    cookie_jar.load()
    install = compose(driver.add_cookie, jar_cookie_to_webdriver_cookie)
    return map(install, cookie_jar)
def login():
    """Log into the API site, reusing cookies persisted under /tmp when possible.

    Returns True when a logged-in `site` object is available, False otherwise.
    """
    global site
    if site and site.logged_in:
        logger.info('Already logged into API site')
        return True
    api_creds = get_credentials()
    if api_creds is None:
        logger.warning('Not creating API site object, no credentials')
        return False
    cookie_path = '/tmp/cookies.txt'
    cookie_jar = MozillaCookieJar(cookie_path)
    if os.path.exists(cookie_path):
        # Load cookies from file, including session cookies (expirydate=0)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        logger.info('Loaded %d cookies', len(cookie_jar))
    connection = requests.Session()
    connection.cookies = cookie_jar
    site = mwclient.Site('en.wikipedia.org', clients_useragent=_ua,
                         pool=connection)
    if not site.logged_in:
        try:
            logger.info('Logging into API site')
            site.login(api_creds['user'], api_creds['pass'])
            # Persist the fresh session cookies for the next run.
            logger.info('Saving cookies')
            cookie_jar.save(ignore_discard=True, ignore_expires=True)
        except mwclient.errors.LoginError:
            logger.exception('Exception logging into Wikipedia')
            return False
    return True
def get_gerrit_rest_api(cookie_jar_path: str, gerrit_url: str) -> GerritRestAPI:
    """Build a GerritRestAPI client authenticated via a Netscape cookie file."""
    jar = MozillaCookieJar(cookie_jar_path)
    jar.load()
    return GerritRestAPI(url=gerrit_url, auth=HTTPCookieAuth(jar))
class MozillaCookiejarHandler:
    """Loads and saves cookies in the Mozilla/Netscape file format."""

    def __init__(self, filename='cookies.txt'):
        self.mozillaCookieJar = None
        self.filename = filename

    def cookiejar(self, policy):
        """Build an HTTPCookieProcessor backed by a jar using *policy* kwargs."""
        jar = MozillaCookieJar(self.filename, DefaultCookiePolicy(**policy))
        self.mozillaCookieJar = jar
        # Only load when the backing file already exists.
        if exists(self.filename):
            jar.load(self.filename)
        return HTTPCookieProcessor(jar)

    def save_cookies(self):
        """Persist the jar back to its file."""
        self.mozillaCookieJar.save()

    def make_cookies(self, response, request):
        """Return cookies extracted from a response/request pair."""
        return self.mozillaCookieJar.make_cookies(response, request)

    def clear_cookies(self, domain=None, path=None, name=None):
        """Remove matching cookies from the jar."""
        self.mozillaCookieJar.clear(domain, path, name)
def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
    """
    original source: https://github.com/python/cpython/blob/
    21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229

    Because urlopen also supports context, I decided to adapt the download
    function.
    """
    url_parsed = urlparse.urlparse(url)

    request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
    if cookies_path is not None:
        # Attach cookies from a Netscape-format file to this single request.
        cookie_jar = MozillaCookieJar(cookies_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        cookie_jar.add_cookie_header(request)

    with contextlib.closing(urllib.request.urlopen(request, context=context)) as fp:
        headers = fp.info()

        # Just return the local path and the 'headers' for file://
        # URLs. No sense in performing a copy unless requested.
        if url_parsed.scheme == 'file' and not filename:
            return os.path.normpath(url_parsed.path), headers

        if not filename:
            raise RuntimeError('No filename specified!')

        tfp = open(filename, 'wb')

        with tfp:
            result = filename, headers
            # read overall
            read = 0
            # 4kb at once
            bs = 1024 * 8
            blocknum = 0
            # guess size
            size = int(headers.get('Content-Length', -1))

            if reporthook:
                reporthook(blocknum, bs, size)

            # Stream the body to disk in fixed-size chunks.
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    # A short read relative to Content-Length means a truncated download.
    if size >= 0 and read < size:
        raise ContentTooShortError('retrieval incomplete: got only %i out of %i bytes' % (read, size), result)

    return result
def __init__(self):
    """Set up the urllib opener: proxy, user-agent and persisted cookies."""
    # error message
    self.error = None
    # establish connection
    self.session = build_opener()

    # add proxy handler if needed
    if config['proxy']:
        if any(config['proxies'].values()):
            self.session.add_handler(ProxyHandler(config['proxies']))
            logger.debug("Proxy is set!")
        else:
            self.error = "Proxy enabled, but not set!"

    # change user-agent
    self.session.addheaders = [('User-Agent', config['ua'])]

    # load local cookies
    mcj = MozillaCookieJar()
    try:
        mcj.load(FILE_C, ignore_discard=True)
        # A 'uid' cookie indicates a usable authenticated session.
        if 'uid' in [cookie.name for cookie in mcj]:
            # if cookie.expires < int(time.time())
            logger.info("Local cookies is loaded")
            self.session.add_handler(HTTPCookieProcessor(mcj))
        else:
            # No auth cookie: discard the stale jar and log in from scratch.
            logger.info("Local cookies expired or bad")
            logger.debug(f"That we have: {[cookie for cookie in mcj]}")
            mcj.clear()
            self.login(mcj)
    except FileNotFoundError:
        self.login(mcj)
def __init__(self, username, password, **kwargs): """ init params :param username: your username :type username: str :param password: your password :type password: str """ # path setup self._path = os.getcwd() self._cookies_path = os.path.join(os.getcwd(), 'cookies_data') self._data_path = os.path.join(os.getcwd(), 'freenom_data') # user setup self.username = username self.password = password # request setup self.headers = { 'Host': 'my.freenom.com', 'Referer': 'https://my.freenom.com/clientarea.php', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36' } self.session = requests.session() self.session.headers = self.headers self.token = '' # cookies setup cookies = MozillaCookieJar(filename=self._cookies_path) if os.path.isfile(self._cookies_path): cookies.load(self._cookies_path, ignore_discard=True, ignore_expires=True) self.session.cookies = cookies # option setup --dev self.timeout = kwargs.get('timeout', 22) self.saveHtml = kwargs.get('saveHtml', False) self._RequireData()
def load_cookies(self, cookies_dir):
    """Read a Netscape-format cookie file and return it as a name -> value dict."""
    jar = MozillaCookieJar()
    jar.load(cookies_dir)
    return {cookie.name: cookie.value for cookie in jar}
def set_cookies_from_file(self, cookies_file):
    """Load a Netscape cookie file and install each cookie into the webdriver.

    Cookies whose domain does not match the driver's current page are
    skipped silently.
    """
    cookies = MozillaCookieJar(cookies_file)
    cookies.load()
    for cookie in cookies:
        try:
            # __dict__ exposes name/value/domain/path etc. as a plain dict.
            self.driver.add_cookie(cookie.__dict__)
        except InvalidCookieDomainException:
            pass
def main(threadname):
    """Endlessly hit the search URL with saved cookies, counting total requests."""
    global total_count
    url = "https://oceanhero.today/web?q=test"
    cj = MozillaCookieJar('cookies.txt')
    cj.load(ignore_expires=True)
    while True:
        requests.get(url, cookies=cj)
        print(f"<{threadname}> {total_count}")
        # NOTE(review): += on a shared global is not atomic across threads;
        # concurrent workers may lose counts — confirm acceptable.
        total_count += 1
def save_cookie(): cookiejar = MozillaCookieJar('cookie.txt') #加载cookie信息s cookiejar.load(ignore_discard=True) handler = request.HTTPCookieProcessor(cookiejar) opener = request.build_opener(handler) opener.open('https://www.baidu.com') cookiejar.save( ignore_discard=True) #ignore_discard=True 设置了表示保存即将过期的cookie信息
def load_cookie_from_local(self):
    """Restore a login session from the local cookie file, if one exists."""
    if not os.path.exists(self.cookie_path):
        return
    jar = MozillaCookieJar(self.cookie_path)
    jar.load(self.cookie_path, ignore_discard=True, ignore_expires=True)
    fresh_session = requests.Session()
    fresh_session.cookies = jar
    self.login_session = fresh_session
def use_cookie():
    """Fetch the user page using cookies previously saved to cookie.txt."""
    info_url = 'http://www.sxt.cn/index/user.html'
    headers = {'User-agent': UserAgent().chrome}
    request = Request(info_url, headers=headers)
    cookie_jar = MozillaCookieJar()
    cookie_jar.load('cookie.txt', ignore_expires=True, ignore_discard=True)
    # Wrap the jar so the opener sends its cookies with the request.
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    response = opener.open(request)
    print(response.read().decode())
def import_cookiejar(filename):
    """Load a Netscape/Mozilla cookie file into a flat name -> value dict.

    Cookies from every domain and every path are included; when the same
    name appears more than once, the last one read wins.
    """
    from http.cookiejar import MozillaCookieJar
    jar = MozillaCookieJar(filename)
    jar.load(ignore_discard=True, ignore_expires=True)
    cookies = {}
    # BUG FIX: the original read only the first path of each domain
    # (list(domain.values())[0]), silently dropping cookies stored under
    # any other path.
    for per_domain in jar._cookies.values():
        for per_path in per_domain.values():
            for key, cookie in per_path.items():
                cookies[key] = cookie.value
    return cookies
def use_cookie():
    """Fetch the sales-rank API endpoint using cookies saved in cookie.txt."""
    url = "https://digital.darentui.com/api/v1/rank/dy_sales_rank/?page=1&date=2020-04-27"
    headers = {"User-Agent": UserAgent().chrome}
    request = Request(url, headers=headers)
    cookie_jar = MozillaCookieJar()
    cookie_jar.load("cookie.txt", ignore_discard=True, ignore_expires=True)
    # Wrap the jar so the opener sends its cookies with the request.
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    response = opener.open(request)
    print(response.read().decode())
def have_cookie_login(self):
    """Try to log in with the saved cookies; fall back to a fresh login if stale."""
    print('Test cookies...')
    cookie = MozillaCookieJar()
    cookie.load(self.cookiesFile, ignore_discard=True, ignore_expires=True)
    self.build_opener(cookie, self.use_proxy)
    page = self.get_page_data(self.userSetUrl)
    # The settings page only contains this marker when authenticated.
    if not search('page-setting-user', page):
        print('This cookies has been invalid.')
        # Stale cookie file is useless: delete it and log in from scratch.
        remove(self.cookiesFile)
        self.have_not_cookie_login()
def use_cookie():
    """Fetch the user page with saved cookies, then re-save the refreshed jar."""
    info_url = "http://www.sxt.cn/index/user.html"
    request = Request(info_url, headers=headers)
    cookie_jar = MozillaCookieJar()
    cookie_jar.load("cookie.txt", ignore_expires=True, ignore_discard=True)
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    response = opener.open(request)
    # Persist any cookies updated by the server during this request.
    cookie_jar.save("cookie.txt", ignore_expires=True, ignore_discard=True)
    print(response.read().decode())
def get_cookie():
    """Fetch the portal index page using cookies previously saved in cookie.txt."""
    # Local import keeps this fix self-contained even if the module does not
    # already import HTTPCookieProcessor at the top of the file.
    from urllib.request import HTTPCookieProcessor
    info_url = "https://i.njtech.edu.cn/index.html"
    headers = {"User-Agent": UserAgent().chrome}
    request = Request(info_url, headers=headers)
    cookie_jar = MozillaCookieJar()
    cookie_jar.load("cookie.txt", ignore_discard=True, ignore_expires=True)
    # BUG FIX: the jar must be wrapped in an HTTPCookieProcessor;
    # MozillaCookieJar(cookie_jar) built a bogus second jar and the loaded
    # cookies were never attached to the opener.
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    response = opener.open(request)
    print(response.read().decode())
def tryLoadCookies(cookies: MozillaCookieJar):
    """Load the jar from its backing file and pin every expiry; return success flag."""
    try:
        cookies.load(ignore_expires=True)
    except LoadError:
        # The file is not in Netscape format.
        print(
            "Cookie has incorrect format, must have comment containing # Netscape on the top"
        )
        return False
    for cookie in cookies:
        cookie.expires = 1551297187
    return True
def use_cookie():
    """Replay cookies from cookie.text against the (currently empty) target URL."""
    info_url = ""
    headers = {
        "User-Agent": UserAgent().random,
    }
    request = Request(info_url, headers=headers)
    cookie_jar = MozillaCookieJar()
    # NOTE(review): filename is 'cookie.text' here, not 'cookie.txt' — confirm intended.
    cookie_jar.load("cookie.text", ignore_expires=True, ignore_discard=True)
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    opener.open(request)
def get_cookies():
    """Get cookies from the cookies.txt file after a login"""
    log = logging.getLogger()
    cj = MozillaCookieJar()
    cj.load('cookies.txt', ignore_discard=True, ignore_expires=True)
    # Push every expiry two weeks into the future so none are rejected as expired.
    for cookie in cj:
        cookie.expires = time.time() + 14 * 24 * 3600
    if DEBUG:
        log.debug(f"Cookiejar: {cj}")
        log.debug(f"Cookiejar length: {len(cj)}")
    return cj
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content.  The
    articles found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    # Default URLs for visiting and submitting Settings pane, as of 3/14
    GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \
        + 'sciifh=1&hl=en&as_sdt=0,5'

    SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \
        + 'q=' \
        + '&scisig=%(scisig)s' \
        + '&inststart=0' \
        + '&as_sdt=1,5' \
        + '&as_sdtp=' \
        + '&num=%(num)s' \
        + '&scis=%(scis)s' \
        + '%(scisf)s' \
        + '&hl=en&lang=all&instq=&inst=569367360547434339&save='

    # Older URLs:
    # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on

    class Parser(ScholarArticleParser120726):
        """Forwards each parsed article to the owning querier."""

        def __init__(self, querier):
            ScholarArticleParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []
        self.query = None
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and \
           os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE,
                               ignore_discard=True)
                ScholarUtils.log('info', 'loaded cookies file')
            # BUG FIX: "except Exception,msg" is Python-2-only syntax and a
            # SyntaxError on Python 3; the "as" form works on 2.6+ and 3.x.
            except Exception as msg:
                ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any
def load_cookies(filename):
    """Return a name -> value dict of every cookie stored in *filename*.

    A missing file yields an empty dict; expired and discardable cookies
    are skipped, matching the default load() behaviour.
    """
    result = {}
    if not os.path.exists(filename):
        return result
    jar = MozillaCookieJar(filename)
    jar.load(ignore_discard=False, ignore_expires=False)
    # Flatten the domain -> path -> name nesting of the internal store.
    for per_domain in jar._cookies.values():
        for per_path in per_domain.values():
            for cookie in per_path.values():
                result[cookie.name] = cookie.value
    return result
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content.  The
    articles found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    # Default URLs for visiting and submitting Settings pane, as of 3/14
    GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \
        + 'sciifh=1&hl=en&as_sdt=0,5'

    SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \
        + 'q=' \
        + '&scisig=%(scisig)s' \
        + '&inststart=0' \
        + '&as_sdt=1,5' \
        + '&as_sdtp=' \
        + '&num=%(num)s' \
        + '&scis=%(scis)s' \
        + '%(scisf)s' \
        + '&hl=en&lang=all&instq=&inst=569367360547434339&save='

    # Older URLs:
    # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on

    class Parser(ScholarArticleParser120726):
        """Forwards each parsed article to the owning querier."""

        def __init__(self, querier):
            ScholarArticleParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []
        self.query = None
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and \
           os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE,
                               ignore_discard=True)
                ScholarUtils.log('info', 'loaded cookies file')
            # BUG FIX: "except Exception, msg" is Python-2-only syntax and a
            # SyntaxError on Python 3; the "as" form works on 2.6+ and 3.x.
            except Exception as msg:
                ScholarUtils.log('warn', 'could not load cookies file: %s' % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any
def loadCookie():
    """Load cookie.txt, dump each cookie, then replay it against httpbin.org."""
    from http.cookiejar import MozillaCookieJar
    cookiejar = MozillaCookieJar("cookie.txt")
    cookiejar.load(ignore_discard=True, ignore_expires=True)
    # Show every loaded cookie's attributes for debugging.
    for c in cookiejar:
        print(c.__dict__)
    handler = request.HTTPCookieProcessor(cookiejar)
    opener = request.build_opener(handler)
    req = request.Request("http://httpbin.org/cookies", headers=headers)
    resp = opener.open(req)
    print(resp.read().decode('utf-8'))
def cookie_from_file(cookiefile):
    """
    reads authentication cookie from file
    @params:
        cookiefile - File containing the cookies.
    """
    cookie_jar = MozillaCookieJar(cookiefile)
    cookie_jar.load(ignore_expires=True)
    # Push every expiry two weeks out to avoid expired-cookie errors.
    for cookie in cookie_jar:
        cookie.expires = time.time() + 14 * 24 * 3600
    assert (len(cookie_jar) > 0)
    return cookie_jar
class Session(object):
    """HTTP session with cookies persisted to disk under *data_path*."""

    def __init__(self, app_name, app_version, data_path, **kwargs):
        self.app_name = app_name
        self.app_version = app_version
        if not data_path or not os.path.isdir(data_path):
            raise Exception('invalid data_path: %s' % data_path)
        self.cookie_jar = MozillaCookieJar(os.path.join(data_path, default.COOKIES_FILENAME))
        try:
            self.cookie_jar.load()
        except EnvironmentError:
            # Missing/unreadable cookie file: start with an empty jar.
            pass
        self.opener = build_opener(
            HTTPRedirectHandler(),
            HTTPCookieProcessor(self.cookie_jar))
        super(Session, self).__init__(**kwargs)

    def open(self, request, default_charset=None):
        """Send *request* with identifying headers; return a charset-decoding reader.

        Returns the raw response when no charset can be determined.
        """
        request.add_header('User-Agent', util.user_agent(self.app_name, self.app_version))
        system_string = json.dumps(util.system_info(self.app_name, self.app_version))
        request.add_header('X-Sputnik-System', system_string)
        r = self.opener.open(request)
        if hasattr(r.headers, 'get_content_charset'):  # py3
            charset = r.headers.get_content_charset() or default_charset
        elif hasattr(r.headers, 'getparam'):  # py2
            charset = r.headers.getparam('charset') or default_charset
        else:
            charset = default_charset
        if charset is None:
            return r
        return codecs.getreader(charset)(r)

    def __del__(self):
        # Persist cookies on teardown; guard against partially-built instances.
        if hasattr(self, 'cookie_jar'):
            self.cookie_jar.save()
class Session(Base):
    """HTTP session with cookies persisted to disk under *data_path*."""

    def __init__(self, data_path, **kwargs):
        if not validation.is_data_path(data_path):
            raise Exception('invalid data_path: %s' % data_path)
        self.cookie_jar = MozillaCookieJar(os.path.join(data_path, default.COOKIES_FILENAME))
        try:
            self.cookie_jar.load()
        except EnvironmentError:
            # Missing/unreadable cookie file: start with an empty jar.
            pass
        self.opener = build_opener(
            HTTPRedirectHandler(),
            HTTPCookieProcessor(self.cookie_jar))
        super(Session, self).__init__(**kwargs)

    def open(self, request, default_charset=None):
        """Send *request* with identifying headers; return a charset-decoding reader.

        Returns the raw response when no charset can be determined.
        """
        request.add_header('User-Agent', self.s.user_agent())
        if self.s.name:
            request.add_header('X-Sputnik-Name', self.s.name)
        if self.s.version:
            request.add_header('X-Sputnik-Version', self.s.version)
        r = self.opener.open(request)
        if hasattr(r.headers, 'get_content_charset'):  # py3
            charset = r.headers.get_content_charset() or default_charset
        elif hasattr(r.headers, 'getparam'):  # py2
            charset = r.headers.getparam('charset') or default_charset
        else:
            charset = default_charset
        if charset is None:
            return r
        return codecs.getreader(charset)(r)

    def __del__(self):
        # Persist cookies on teardown; guard against partially-built instances.
        if hasattr(self, 'cookie_jar'):
            self.cookie_jar.save()
class Aurploader(object): """ A user object for interactive actions. """ def __init__( self, cookiejar_path=None, cookiejar=None, token=None, categories=None ): """ cookiejar: a MozillaCookieJar object token: a user token for submitting form data categories: package categories """ if cookiejar_path is None: cookiejar_path = get_default_cookiejar_path() self.cookiejar_path = cookiejar_path if cookiejar is None: self.cookiejar = MozillaCookieJar() self.load_cookies() else: self.cookiejar = cookiejar # TODO # Find way to use this with URL opener. (urlopen accepts a capath arg) # CA_PATH = '/etc/ssl/certs' self.opener = build_opener(HTTPCookieProcessor(self.cookiejar)) self.token = token self.categories = categories # self.rpc = AUR(ttl=0, clean=False) self.rpc = AUR() def get_info(self, pkgname): """ Get package information from the RPC interface. """ for pkg in self.rpc.info(pkgname): return pkg def parse_pkgsubmit(self): """ Parse the pkgsubmit page. This will return package categories along with hidden inputs such as the the token. If the returned values are empty then the user is not currently logged in, so it doubles as a login check. """ parser = pkgsubmitParser() with self.opener.open(PKGSUBMIT_URL) as f: parser.feed(f.read().decode()) if parser.token: self.token = parser.token self.categories = parser.categories def login(self, user=None, passwd=None, login_file=None, remember_me=True): """ Log in to the AUR. """ if login_file is not None: user, passwd = load_login_file(login_file) if user is None or passwd is None: self.rpc.log("logging in to the AUR") if user is None: user = input('Username: '******'user', user), ('passwd', passwd) ] if remember_me: data.append(('remember_me', '1')) data = urlencode(data).encode('UTF-8') with self.opener.open(LOGIN_URL, data) as f: pass # python3-AUR could be used to cache the data, but sometimes the data must be # fresh, such as when confirming the upload. 
def submit_package_form( self, pkginfo, action, confirm_delete=False, merge_into=None, comment=None, category=None, ): """ Submit a form to the AUR. """ ID = pkginfo['ID'] url = AUR_URL + '/packages/{}/'.format(pkginfo['Name']) # Old form actions, converted to links with AUR 2.0 do_actions = { # 'do_Vote' : 'Vote', # 'do_UnVote' : 'UnVote', # 'do_Notify' : 'Notify', # 'do_UnNotify' : 'UnNotify', # 'do_Flag' : 'Flag Out-of-date', 'do_Disown' : 'Disown Packages', 'do_Delete' : 'Delete Packages', 'do_Adopt' : 'Adopt Packages', } if action in do_actions: url = AUR_URL + '/packages/' data = [ ('IDs[{!s}]'.format(ID), '1'), ('ID', ID), ('token', self.token), (action, do_actions[action]) ] if confirm_delete: data.append(('confirm_Delete', '1')) if merge_into: data.append(('merge_Into', merge_into)) elif action == 'comment': if comment: data = ( ('ID', ID), ('token', self.token), ('comment', comment) ) else: raise AurploaderError("no comment submitted") elif action == 'do_ChangeCategory': if category: data = ( ('action', 'do_ChangeCategory'), ('category_id', category), ('token', self.token) ) else: raise AurploaderError("no category submitted for do_ChangeCategory") elif action == 'do_DeleteComment': if category: data = ( ('action', 'do_DeleteComment'), ('comment_id', comment_id), ('token', self.token), ('submit', '1') ) else: raise AurploaderError("no category submitted for do_ChangeCategory") data = urlencode(data).encode('UTF-8') with self.opener.open(url, data) as f: pass def do_package_action(self, pkginfo, action): """ Perform one of the link-based package actions. Use submit_package_form() for form-based actions. """ actions = PACKAGE_ACTIONS if action in actions: url = AUR_URL + '/packages/{}/{}'.format(pkginfo['Name'], action) with self.opener.open(url) as f: pass else: raise AurploaderError("unrecognized action ({})".format(action) ) def prompt_categories(self, name, default_category=None): """ Prompt the user to select a category for the given package. 
""" if not self.categories: raise AurploaderError("no categories") if default_category not in self.categories: default_category = None while True: print('Select category for {}'.format(name)) for n in sorted(self.categories): print(' {:2d}) {}'.format(n, self.categories[n])) print('Enter "x" to skip this package.') if default_category: category = input('Category [{}]: '.format(default_category)) else: category = input('Category: ') if category.lower() == 'x': return None elif not category and default_category: return default_category else: try: category = int(category) if category in self.categories: return category except ValueError: continue # Python has had an open request for multipart/form-data since 2008-06-30 # http://bugs.python.org/issue3244 # At the time of writing, the latest submitted code does not work and hacking # together something that does is just not worth it right now. def upload_pkg(self, fpath, category=None, auto_category=False, confirm=True): """ Upload a package to the AUR. """ fname = os.path.basename(fpath) pkginfo = None try: pkg, ext = fname.split('.src.', 1) name, ver, rel = pkg.rsplit('-', 2) except ValueError: raise AurploaderError('unexpected filename format: {}\nexpected <pkgname>-<pkgver>-<pkgrel>.src.<ext>'.format(fname)) if category not in self.categories: category = None if category is None: pkginfo = self.get_info(name) if pkginfo: category = int(pkginfo['CategoryID']) if category is None or not auto_category: category = self.prompt_categories(name, default_category=category) # This is not an error. A user may abort the upload by entering "x" at the # category prompt. 
if category is None: return cmd = [ '/usr/bin/curl', '-#', '-H', 'Expect:', '-b', self.cookiejar_path, '-c', self.cookiejar_path, '-F', 'category={}'.format(category), '-F', 'pfile=@{}'.format(fpath), '-F', 'pkgsubmit=1', '-F', 'token={}'.format(self.token) ] cmd.append(PKGSUBMIT_URL) self.save_cookies() with open(os.devnull, 'w') as null: p = Popen(cmd, stdout=null) e = p.wait() if e != 0: raise AurploaderError("curl exited with non-zero status ({:d})".format(e)) self.load_cookies() if confirm: expected = '{}-{}'.format(ver, rel) ttl = self.rpc.ttl self.rpc.ttl = 0 try: pkginfo = self.get_info(name) finally: self.rpc.ttl = ttl if not pkginfo or pkginfo['Version'] != expected: raise AurploaderError('failed to confirm upload') return pkginfo def save_cookies(self, path=None): """ Save cookie jar. """ if path is None: path = self.cookiejar_path if path is None: raise AurploaderError('no cookiejar path given') # For Curl compatibility (not sure which one fails to comply with the standard. for cookie in self.cookiejar: if not cookie.expires: cookie.expires = 0 self.cookiejar.save(path, ignore_discard=True, ignore_expires=True) def load_cookies(self, path=None): """ Load cookie jar. """ if path is None: path = self.cookiejar_path if path is None: raise AurploaderError('no cookiejar path given') try: # For Curl compatibility (not sure which one fails to comply with the standard. self.cookiejar.load(path, ignore_discard=True, ignore_expires=True) for cookie in self.cookiejar: if not cookie.expires: cookie.expires = None except LoadError: pass except IOError as e: if e.errno != errno.ENOENT: raise e def initialize(self, user=None, passwd=None, login_file=None, cookiejar_path=None): """ Login if necessary and load categories and token. 
""" self.load_cookies(cookiejar_path) self.parse_pkgsubmit() if not self.categories or not self.token: self.login(user=user, passwd=passwd, login_file=login_file) self.parse_pkgsubmit() if not self.categories or not self.token: raise AurploaderError('login appears to have failed\n') elif cookiejar_path: self.save_cookies(cookiejar_path)
"http://security.stackexchange.com/users/8857/b-con", "http://stackoverflow.com/users/1361836/b-con" ] logging.basicConfig(filename="/tmp/site-ping.log", datefmt="%m-%d %H:%M", level=logging.DEBUG) # Extract the cookies from Firefox. The script to do so is co-located. path = os.path.dirname(os.path.realpath(__file__)) p = subprocess.call(path + "/extract-cookies.sh") # Load the cookies. cj = MozillaCookieJar("/tmp/firefox-cookies.txt") try: cj.load() except FileNotFoundErr as ex: logging.error(ex) quit(1) # Use the cookies to visit each of the URLs. for url in urls: opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) response = opener.open(url) html = response.read().decode("utf-8") response.close() # The "votes" tab only appears on the user profile when you're logged in. match = re.search("tab=votes", html) if match:
class BaseClient(object):
    """Base class for working with a remote API."""
    username = None
    password = None
    url = 'http://localhost:8000/api/'
    headers = {
        "Content-type": "application/json",
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
    }
    timeout = 10000
    cookiejar = None
    print_info = False
    code_page = 'utf-8'
    use_basic_auth = False

    def __init__(self, cookie_filename=None, **kwargs):
        # Any keyword overrides the class-level defaults above.
        for key, val in kwargs.items():
            setattr(self, key, val)
        if cookie_filename:
            self.set_cookiejar(cookie_filename)

    def set_cookiejar(self, name):
        # Create the backing file on first use when load() fails.
        self.cookiejar = MozillaCookieJar(name)
        try:
            self.cookiejar.load()
        except IOError:
            self.cookiejar.save()

    def get_request(self, data):
        """
        Returns a new request object.
        """
        params = urlencode({'jsonData': data})
        params = params.encode('ascii')
        headers = {}
        headers.update(self.headers)
        if self.use_basic_auth and self.username and self.password:
            s = '%s:%s' % (self.username, self.password)
            if six.PY3:
                b = bytes(s, 'utf-8')
            else:
                b = bytes(s.encode('utf-8'))
            headers['Authorization'] = b'Basic ' + base64.b64encode(b)
        request = Request(url=self.url, data=params, headers=headers)
        return request

    def get_opener(self):
        """
        Returns a new request opener with the required processors attached.
        """
        args = ()
        if not self.cookiejar is None:
            cookiehand = HTTPCookieProcessor(self.cookiejar)
            args += (cookiehand,)
        return build_opener(*args)

    def get_response(self, request):
        """
        Returns a new response for the request and saves the cookies.
        """
        opener = self.get_opener()
        try:
            response = opener.open(request, timeout=self.timeout)
        except IOError as e:
            raise e
        if not self.cookiejar is None:
            self.cookiejar.save()
        return response

    def get_result(self, data):
        """
        Requests data from the API.
        """
        if self.print_info:
            print('Kwargs: %s' % data.get('kwargs', {}))
        jsondata = json.dumps(data)
        request = self.get_request(jsondata)
        response = self.get_response(request)
        info = response.info()
        encoding = info.get('Content-encoding', None)
        if self.print_info:
            print('Status: %s' % response.code)
            print(info)
        data = response.read()
        # The zlib compressor's windowBits usage is described at
        # http://www.zlib.net/manual.html#Advanced
        # According to it:
        # RFC 1950 ZLIB (also DEFLATE) wbits from 8 to 15
        # RFC 1951 RAW DEFLATE wbits from -8 to -15
        # RFC 1952 GZIP wbits in the range 8 to 15 incremented by 16
        if encoding == 'deflate':
            try:
                return zlib.decompress(data)
            except zlib.error:
                return zlib.decompress(data, -zlib.MAX_WBITS)
        elif encoding == 'gzip':
            return zlib.decompress(data, zlib.MAX_WBITS | 16)
        else:
            return data

    def json_loads(self, data):
        """
        Converts JSON into Python objects, honouring the configured encoding.
        """
        data = data.decode(self.code_page)
        data = json.loads(data)
        return data

    def prepare_data(self, data):
        """
        Meant to be overridden in subclasses.
        Here the account credentials are simply added.
        """
        if self.username and not self.use_basic_auth:
            data['username'] = self.username
            data['password'] = self.password
        return data

    def clean(self, data):
        """
        Transforms the received data, raising RemoteAPIError on non-200 status.
        """
        data = self.json_loads(data)
        if data is None:
            return data
        status = data.get('status', None)
        if status != 200:
            msg = data.get('message', None)
            if msg:
                if six.PY3:
                    error = '%s - %s' % (status, msg)
                else:
                    error = b'%s - %s' % (status, msg.encode(self.code_page))
            else:
                error = data
            raise RemoteAPIError(error)
        return data['data']

    def method(self, method, **kwargs):
        """
        Calls an API method and returns the cleaned data.
        """
        data = {'method': method, 'kwargs': kwargs}
        data = self.prepare_data(data)
        data = self.get_result(data)
        data = self.clean(data)
        return data
def put(self, job_dict, args): if not isinstance(job_dict, dict): raise Exception("Jobs must be submitted as dictionaries") # Make this a DotDict to make accessing keys cleaner job = DotDict(job_dict) # URL is the only thing required in each datum if not "url" in job: raise Exception("No url specified") # Add an http prefix onto our URL, if its not # explicitly defined as HTTP/HTTPS if job.url[:4] != "http": job.url = "http://" + job.url # Other options can be inherited from those specified # on the command line. Do some sanity checking here, too # Set our method (GET, POST, etc) if not "method" in job: job.method = args.method # Read in our job delay... try: job.delay = (job.delay/1000.0 if 'delay' in job else args.delay/1000.0) except ValueError: raise Exception("Delay must be an integer") # ... and set our query parameters job.params = {} job.orig_url = job.url if "?" in job.url: job.url, query_string = job.url.split("?", 1) job.params = parse_qs(query_string) # ... and our authentication (if any) if "auth" in job: job.auth = job.auth.split(":",1) elif args.auth: job.auth = args.auth.split(":",1) else: job.auth = None job.auth = None if "authtype" in job: job.authtype = job.authtype else: job.authtype = args.authtype if job.auth and len(job.auth) == 1: raise Exception("Credentials must be in username:password format") if job.authtype not in ("basic","digest"): raise Exception("Auth type must be one of: basic, digest") # ... and our job counter try: job.count = int(job.count) if 'count' in job else args.num except ValueError: raise Exception("Count must be an integer") # ... and cookies! try: cj = MozillaCookieJar() if "cookiejar" in job: cj.load(job.cookiejar) job.cookiejar = cj elif args.cookiejar: cj.load(args.cookiejar) job.cookiejar = cj else: job.cookiejar = None except Exception as e: raise Exception("Unable to load cookie jar: {}".format(e)) # ... 
our insecure option if not "insecure" in job: job.insecure = args.insecure else: if not isinstance(job.insecure, bool): raise Exception("Insecure flag must be a boolean") # Fix up method case; RFCs 7230/1 state method is case sensitive, # but all current recognized methods are upper case, soooo... job.method = job.method.upper() # Now turn our list of header key:value pairs into # the dict that the requests module requires header_list = [] # Coalesce headers from the command line and the job/url file, if any if "headers" in job: if not isinstance(job.headers, list): raise Exception("Headers must be in list form") header_list = job.headers + args.header else: header_list = args.header # Convert our list of colon-delimited k:v pairs to a dict header_dict = {} for kv in header_list: try: key, val = [s.strip() for s in kv.split(':')] header_dict[key.lower()] = val except: raise Exception( "'{}' header must be in 'key:value' format".format(kv) ) # Set our user agent here, since it is a header too if not "user-agent" in header_dict: if "agent" in job: header_dict["user-agent"] = job.agent else: header_dict["user-agent"] = args.agent # Override the connection header if user has requests keep-alives # be disabled if args.nokeepalive: header_dict["connection"] = "close" # Overwrite the header list with the header dict for requests job.headers = header_dict # Set up POST file reads upload_files = (job.upload + args.upload if "upload" in job else args.upload) job.upload = [] for file_data in upload_files: i = file_data.split(":", 2) if len(i) < 2: raise Exception("Upload files must be in " "form_var:file_path[:content_type] format") file_var, file_path = i # Make sure our file exists try: open(file_path, "rb") except: raise Exception( "{} is not a readable file!".format(file_path) ) # Now guess the mime type if we weren't provided one explicitly if len(i) == 3: mime_type = i[2] else: mime_type = (mimetypes.guess_type(file_path)[0] or 'application/octet-stream') # Now stick 
the file data in our upload list job.upload.append((file_var, file_path, mime_type)) # Override the method if we have multipart files to POST if job.upload: job.method = "POST" # Now insert the job into our work queue with self.lock: self.jobs.append(job) self.length += 1
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content. The
    articles found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    class Parser(ScholarArticleParser):
        # Thin parser subclass that forwards each parsed article back
        # to the querier that owns it.
        def __init__(self, querier):
            ScholarArticleParser.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []  # ScholarArticle results of the last query
        self.query = None   # last ScholarQuery sent, if any
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and \
           os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE,
                               ignore_discard=True)
                ScholarUtils.log('info', 'loaded cookies file')
            except Exception as msg:
                ScholarUtils.log('warn',
                                 'could not load cookies file: %s' % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any

    def send_query(self, query):
        """
        This method initiates a search query (a ScholarQuery instance)
        with subsequent parsing of the response.
        """
        self.clear_articles()
        self.query = query
        html = self._get_http_response(url=query.get_url(),
                                       log_msg='dump of query response HTML',
                                       err_msg='results retrieval failed')
        if html is None:
            return
        self.parse(html)

    def parse(self, html):
        """
        This method allows parsing of provided HTML content.
        """
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        # This variant only collects articles; citation data is not
        # fetched here.
        self.articles.append(art)

    def clear_articles(self):
        """Clears any existing articles stored from previous queries."""
        self.articles = []

    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, sends HTTP request and returns response payload.
        Returns None (after logging err_msg) on any failure.
        """
        if log_msg is None:
            log_msg = 'HTTP response data follow'
        if err_msg is None:
            err_msg = 'request failed'
        try:
            ScholarUtils.log('info', 'requesting %s' % unquote(url))
            req = Request(url=url,
                          headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            html = hdl.read()
            ScholarUtils.log('debug', log_msg)
            ScholarUtils.log('debug', '>>>>' + '-'*68)
            ScholarUtils.log('debug', 'url: %s' % hdl.geturl())
            ScholarUtils.log('debug', 'result: %s' % hdl.getcode())
            ScholarUtils.log('debug', 'headers:\n' + str(hdl.info()))
            ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))  # For Python 3
            ScholarUtils.log('debug', '<<<<' + '-'*68)
            return html
        except Exception as err:
            ScholarUtils.log('info', err_msg + ': %s' % err)
            return None
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content. The
    articles found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    # Advanced-search URL template; filled in per query by send_query().
    SCHOLAR_QUERY_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \
        + 'as_q=%(words)s' \
        + '&as_epq=%(phrase)s' \
        + '&as_oq=%(words_some)s' \
        + '&as_eq=%(words_none)s' \
        + '&as_occt=%(scope)s' \
        + '&as_sauthors=%(authors)s' \
        + '&as_publication=%(pub)s' \
        + '&as_ylo=%(ylo)s' \
        + '&as_yhi=%(yhi)s' \
        + '&btnG=&hl=en&as_sdt=0,5&num=%(num)s'

    # Default URLs for visiting and submitting Settings pane, as of 3/14
    GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \
        + 'sciifh=1&hl=en&as_sdt=0,5'

    SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \
        + 'q=' \
        + '&scisig=%(scisig)s' \
        + '&inststart=0' \
        + '&as_sdt=1,5' \
        + '&as_sdtp=' \
        + '&num=%(num)s' \
        + '&scis=%(scis)s' \
        + '%(scisf)s' \
        + '&hl=en&lang=all&instq=&inst=569367360547434339&save='

    # Older URLs:
    # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on

    class Parser(ScholarArticleParser120726):
        # Thin parser subclass that forwards each parsed article back
        # to the querier that owns it.
        def __init__(self, querier):
            ScholarArticleParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []  # ScholarArticle results of the last query
        self.query = None   # last ScholarQuery sent, if any
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and \
           os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE,
                               ignore_discard=True)
                ScholarUtils.log('debug', 'loaded cookies file')
            except Exception as msg:
                ScholarUtils.log('warn',
                                 'could not load cookies file: %s' % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any

    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Returns True on success (or when nothing needs applying),
        False when retrieving or submitting the settings form fails.
        """
        if settings is None or not settings.is_configured():
            return True
        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.
        try:
            req = Request(url=self.GET_SETTINGS_URL,
                          headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            html = hdl.read()
        except Exception as err:
            ScholarUtils.log('debug', 'requesting settings failed: %s' % err)
            return False

        # Now parse the required stuff out of the form. We require the
        # "scisig" token to make the upload of our settings acceptable
        # to Google.
        soup = BeautifulSoup(html)
        tag = soup.find(name='form', attrs={'id': 'gs_settings_form'})
        if tag is None:
            ScholarUtils.log('debug', 'parsing settings failed: no form')
            return False
        tag = tag.find('input', attrs={'type':'hidden', 'name':'scisig'})
        if tag is None:
            ScholarUtils.log('debug', 'parsing settings failed: scisig')
            return False

        urlargs = {'scisig': tag['value'],
                   'num': settings.per_page_results,
                   'scis': 'no',
                   'scisf': '' }
        if settings.citform != 0:
            urlargs['scis'] = 'yes'
            urlargs['scisf'] = '&scisf=%d' % settings.citform

        try:
            req = Request(url=self.SET_SETTINGS_URL % urlargs,
                          headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
        except Exception as err:
            ScholarUtils.log('debug', 'applying settings failed: %s' % err)
            return False

        ScholarUtils.log('debug', 'settings applied')
        return True

    def send_query(self, query, scholar_url=None):
        """
        This method initiates a search query (a ScholarQuery instance)
        with subsequent parsing of the response.
        """
        self.clear_articles()
        self.query = query

        url = scholar_url if scholar_url else self.SCHOLAR_QUERY_URL
        urlargs = {'words': query.words or '',
                   'words_some': query.words_some or '',
                   'words_none': query.words_none or '',
                   'phrase': query.phrase or '',
                   'scope': 'title' if query.scope_title else 'any',
                   'authors': query.author or '',
                   'pub': query.pub or '',
                   'ylo': query.timeframe[0] or '',
                   'yhi': query.timeframe[1] or '',
                   'num': query.num_results or ScholarConf.MAX_PAGE_RESULTS }

        # Make sure we urlencode all this stuff correctly:
        for key, val in urlargs.items():
            urlargs[key] = quote(str(val))

        url = url % urlargs
        ScholarUtils.log('debug', 'query url: %s' % url)

        req = Request(url=url,
                      headers={'User-Agent': ScholarConf.USER_AGENT})
        hdl = self.opener.open(req)
        html = hdl.read()
        self.parse(html)

    def get_citation_data(self, article):
        """
        Given an article, retrieves citation link. Note, this requires that
        you adjusted the settings to tell Google Scholar to actually
        provide this information, *prior* to retrieving the article.
        """
        if article['citlink'] is None:
            return False
        if article.citation_data is not None:
            return True
        try:
            ScholarUtils.log('debug', 'retrieving citation export from %s' \
                             % article['citlink'])
            req = Request(url=article['citlink'],
                          headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            data = hdl.read()
            article.set_citation_data(data)
        except Exception as err:
            ScholarUtils.log('debug', 'requesting citation failed: %s' % err)
            return False
        return True

    def parse(self, html):
        """
        This method allows parsing of provided HTML content.
        """
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        # Fetch citation data eagerly so the stored article is complete.
        self.get_citation_data(art)
        self.articles.append(art)

    def clear_articles(self):
        """Clears any existing articles stored from previous queries."""
        self.articles = []

    def save_cookies(self):
        """
        This stores the latest cookies we're using to disk, for reuse in a
        later session.
        """
        if ScholarConf.COOKIE_JAR_FILE is None:
            return False
        try:
            self.cjar.save(ScholarConf.COOKIE_JAR_FILE,
                           ignore_discard=True)
            ScholarUtils.log('debug', 'saved cookies file')
            return True
        except Exception as msg:
            ScholarUtils.log('warn',
                             'could not save cookies file: %s' % msg)
            return False
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content. The
    articles found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    # Default URLs for visiting and submitting Settings pane, as of 3/14
    GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \
        + 'sciifh=1&hl=en&as_sdt=0,5'

    SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \
        + 'q=' \
        + '&scisig=%(scisig)s' \
        + '&inststart=0' \
        + '&as_sdt=1,5' \
        + '&as_sdtp=' \
        + '&num=%(num)s' \
        + '&scis=%(scis)s' \
        + '%(scisf)s' \
        + '&hl=en&lang=all&instq=&inst=569367360547434339&save='

    # Older URLs:
    # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on

    class Parser(ScholarArticleParser120726):
        # Thin parser subclass that forwards each parsed article back
        # to the querier that owns it.
        def __init__(self, querier):
            ScholarArticleParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []  # ScholarArticle results of the last query
        self.query = None   # last ScholarQuery sent, if any
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and \
           os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE,
                               ignore_discard=True)
                ScholarUtils.log('info', 'loaded cookies file')
            except Exception as msg:
                ScholarUtils.log('warn',
                                 'could not load cookies file: %s' % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any

    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Returns True on success (or when nothing needs applying),
        False when retrieving or submitting the settings form fails.
        """
        if settings is None or not settings.is_configured():
            return True
        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.
        html = self._get_http_response(url=self.GET_SETTINGS_URL,
                                       log_msg='dump of settings form HTML',
                                       err_msg='requesting settings failed')
        if html is None:
            return False

        # Now parse the required stuff out of the form. We require the
        # "scisig" token to make the upload of our settings acceptable
        # to Google.
        soup = BeautifulSoup(html)
        tag = soup.find(name='form', attrs={'id': 'gs_settings_form'})
        if tag is None:
            ScholarUtils.log('info', 'parsing settings failed: no form')
            return False
        tag = tag.find('input', attrs={'type': 'hidden', 'name': 'scisig'})
        if tag is None:
            ScholarUtils.log('info', 'parsing settings failed: scisig')
            return False

        urlargs = {'scisig': tag['value'],
                   'num': settings.per_page_results,
                   'scis': 'no',
                   'scisf': ''}
        if settings.citform != 0:
            urlargs['scis'] = 'yes'
            urlargs['scisf'] = '&scisf=%d' % settings.citform

        # BUG FIX: the error message previously read
        # 'applying setttings failed' (triple-t typo).
        html = self._get_http_response(url=self.SET_SETTINGS_URL % urlargs,
                                       log_msg='dump of settings result HTML',
                                       err_msg='applying settings failed')
        if html is None:
            return False

        ScholarUtils.log('info', 'settings applied')
        return True

    def send_query(self, query):
        """
        This method initiates a search query (a ScholarQuery instance)
        with subsequent parsing of the response.
        """
        self.clear_articles()
        self.query = query
        html = self._get_http_response(url=query.get_url(),
                                       log_msg='dump of query response HTML',
                                       err_msg='results retrieval failed')
        if html is None:
            return
        self.parse(html)

    def get_citation_data(self, article):
        """
        Given an article, retrieves citation link. Note, this requires that
        you adjusted the settings to tell Google Scholar to actually
        provide this information, *prior* to retrieving the article.
        """
        if article['url_citation'] is None:
            return False
        if article.citation_data is not None:
            return True
        ScholarUtils.log('info', 'retrieving citation export data')
        data = self._get_http_response(url=article['url_citation'],
                                       log_msg='citation data response',
                                       err_msg='requesting citation data failed')
        if data is None:
            return False
        article.set_citation_data(data)
        return True

    def parse(self, html):
        """
        This method allows parsing of provided HTML content.
        """
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        # Fetch citation data eagerly so the stored article is complete.
        self.get_citation_data(art)
        self.articles.append(art)

    def clear_articles(self):
        """Clears any existing articles stored from previous queries."""
        self.articles = []

    def save_cookies(self):
        """
        This stores the latest cookies we're using to disk, for reuse in a
        later session.
        """
        if ScholarConf.COOKIE_JAR_FILE is None:
            return False
        try:
            self.cjar.save(ScholarConf.COOKIE_JAR_FILE,
                           ignore_discard=True)
            ScholarUtils.log('info', 'saved cookies file')
            return True
        except Exception as msg:
            ScholarUtils.log('warn',
                             'could not save cookies file: %s' % msg)
            return False

    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, sends HTTP request and returns response payload.
        Returns None (after logging err_msg) on any failure.
        """
        if log_msg is None:
            log_msg = 'HTTP response data follow'
        if err_msg is None:
            err_msg = 'request failed'
        try:
            ScholarUtils.log('info', 'requesting %s' % url)
            req = Request(url=url,
                          headers={'User-Agent': ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            html = hdl.read()
            ScholarUtils.log('debug', log_msg)
            ScholarUtils.log('debug', '>>>>' + '-'*68)
            ScholarUtils.log('debug', 'url: %s' % hdl.geturl())
            ScholarUtils.log('debug', 'result: %s' % hdl.getcode())
            ScholarUtils.log('debug', 'headers:\n' + str(hdl.info()))
            # BUG FIX: the body is bytes on Python 3; concatenating it to a
            # str raised TypeError inside this try block, so every request
            # was reported as failed. Decode first, as the sibling querier
            # implementations in this file do.
            ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))  # For Python 3
            ScholarUtils.log('debug', '<<<<' + '-'*68)
            return html
        except Exception as err:
            ScholarUtils.log('info', err_msg + ': %s' % err)
            return None
class PowerSchool:
    """This class manages cookies for accessing PowerSchool, as well as
    providing facilities for retrieving pages."""

    def __init__(self, host=DEFAULT_HOST, cookiejar=None, debug=False):
        """Params:
        host: the protocol, hostname, and port (without a trailing slash)
            that is the root of the PowerSchool url.
        cookiejar: An http.cookiejar.CookieJar or subclass. If a
            FileCookieJar, cookies will be saved after every request.
        debug: sets verbose mode"""
        self.DEBUG = debug
        self.host = host
        self.setCookieJar(cookiejar)

    def setCookieJar(self, cookiejar):
        """Changes the CookieJar used to manage the session.
        Existing cookies will not be transferred.
        Returns: the old CookieJar"""
        tmpcookies = getattr(self, "cookies", None)
        # A string is treated as a cookie-file path, backed by a
        # MozillaCookieJar (isinstance instead of type() comparison).
        if isinstance(cookiejar, str):
            self.cookies = MozillaCookieJar(cookiejar)
            if os.path.exists(cookiejar):
                self.cookies.load(ignore_discard=True)
        else:
            self.cookies = cookiejar
        self.opener = build_opener(HTTPCookieProcessor(self.cookies))
        return tmpcookies

    def _get_page(self, url, data=None):
        """Open url (POSTing urlencoded data when given) and return the
        response object; file-backed cookie jars are saved afterwards."""
        start = time.time()
        page = (self.opener.open(url, urlencode(data).encode())
                if data else self.opener.open(url))
        if self.DEBUG:
            print("Request time: {}".format(time.time() - start))
        if hasattr(self.cookies, "save"):
            self.cookies.save(ignore_discard=True)
        return page

    def _read_page(self, url, data=None):
        """Fetch url and return (and remember) the decoded body."""
        self.__last_page = self._get_page(url, data).read().decode()
        if self.DEBUG:
            # Context manager so the debug dump handle is not leaked.
            with open("/tmp/pschool-debug-temp.html", "w") as fd:
                fd.write(self.__last_page)
        return self.__last_page

    def _get_url(self, url):
        # Join host and path, tolerating a missing leading slash.
        return self.host + (url if url.startswith("/") else "/" + url)

    def _check_for_logout(self):
        # The sign-in page appears whenever the session is not valid.
        if "Student and Parent Sign In" in self.__last_page:
            raise LoggedOut()

    def login(self, username, password):
        """Login to a PowerSchool session using the supplied credentials.

        Raises InvalidCredentials when the server bounces us back to the
        sign-in page."""
        data = self._read_page(self._get_url("/public/"))
        # Collect the hidden form fields the login form expects.
        form = dict(re.findall(r'<input .+?name="(.+?)".+?value="(.*?)".+?>',
                               data, re.MULTILINE | re.IGNORECASE))
        form["account"] = username
        form["ldappassword"] = password
        pskey = form["contextData"].encode()
        password = password.encode()
        b64pw = b64encode(md5(password).digest()).decode().rstrip("=")
        # BUG FIX: hmac.new() requires an explicit digestmod since
        # Python 3.8; MD5 was the historical implicit default, so
        # passing it keeps the computed digests identical.
        form["pw"] = hmac.new(pskey, b64pw.encode(), md5).hexdigest()
        form["dbpw"] = hmac.new(pskey, password.lower(), md5).hexdigest()
        self._read_page(self._get_url("/guardian/home.html"), form)
        try:
            self._check_for_logout()
        except LoggedOut:
            raise InvalidCredentials

    def get(self, page="Main", args=(), **kwargs):
        """Retrieves data for and constructs the supplied Page class."""
        if isinstance(page, str):
            page = getattr(pages, page, None)
        if not page:
            raise TypeError("Invalid page")
        data = self._read_page(self._get_url(page.get_url(*args, **kwargs)))
        self._check_for_logout()
        return page(data, self, (args, kwargs))
class ScholarQuerier(object):
    """
    ScholarQuerier instances can conduct a search on Google Scholar
    with subsequent parsing of the resulting HTML content. The articles
    found are collected in the articles member, a list of
    ScholarArticle instances.
    """

    # Default URLs for visiting and submitting Settings pane, as of 3/14
    GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + "/scholar_settings?" + "sciifh=1&hl=en&as_sdt=0,5"
    SET_SETTINGS_URL = (
        ScholarConf.SCHOLAR_SITE
        + "/scholar_setprefs?"
        + "q="
        + "&scisig=%(scisig)s"
        + "&inststart=0"
        + "&as_sdt=1,5"
        + "&as_sdtp="
        + "&num=%(num)s"
        + "&scis=%(scis)s"
        + "%(scisf)s"
        + "&hl=en&lang=all&instq=&inst=569367360547434339&save="
    )
    # Older URLs:
    # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on

    class Parser(ScholarArticleParser120726):
        # Parser subclass that reports the result count and each parsed
        # article back to the owning querier.
        def __init__(self, querier):
            ScholarArticleParser120726.__init__(self)
            self.querier = querier

        def handle_num_results(self, num_results):
            if self.querier is not None and self.querier.query is not None:
                self.querier.query["num_results"] = num_results

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self):
        self.articles = []  # ScholarArticle results of the last query
        self.query = None   # last ScholarQuery sent, if any
        self.cjar = MozillaCookieJar()

        # If we have a cookie file, load it:
        if ScholarConf.COOKIE_JAR_FILE and os.path.exists(ScholarConf.COOKIE_JAR_FILE):
            try:
                self.cjar.load(ScholarConf.COOKIE_JAR_FILE, ignore_discard=True)
                ScholarUtils.log("info", "loaded cookies file")
            except Exception as msg:
                ScholarUtils.log("warn", "could not load cookies file: %s" % msg)
                self.cjar = MozillaCookieJar()  # Just to be safe

        self.opener = build_opener(HTTPCookieProcessor(self.cjar))
        self.settings = None  # Last settings object, if any

    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Returns True on success (or when nothing needs applying),
        False when retrieving or submitting the settings form fails.
        """
        if settings is None or not settings.is_configured():
            return True
        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.
        html = self._get_http_response(
            url=self.GET_SETTINGS_URL,
            log_msg="dump of settings form HTML",
            err_msg="requesting settings failed"
        )
        if html is None:
            return False

        # Now parse the required stuff out of the form. We require the
        # "scisig" token to make the upload of our settings acceptable
        # to Google.
        soup = BeautifulSoup(html)
        tag = soup.find(name="form", attrs={"id": "gs_settings_form"})
        if tag is None:
            ScholarUtils.log("info", "parsing settings failed: no form")
            return False
        tag = tag.find("input", attrs={"type": "hidden", "name": "scisig"})
        if tag is None:
            ScholarUtils.log("info", "parsing settings failed: scisig")
            return False

        urlargs = {"scisig": tag["value"],
                   "num": settings.per_page_results,
                   "scis": "no",
                   "scisf": ""}
        if settings.citform != 0:
            urlargs["scis"] = "yes"
            urlargs["scisf"] = "&scisf=%d" % settings.citform

        # NOTE(review): "setttings" below is a typo in the logged
        # error message (should read "settings").
        html = self._get_http_response(
            url=self.SET_SETTINGS_URL % urlargs,
            log_msg="dump of settings result HTML",
            err_msg="applying setttings failed",
        )
        if html is None:
            return False

        ScholarUtils.log("info", "settings applied")
        return True

    def send_query(self, query):
        """
        This method initiates a search query (a ScholarQuery instance)
        with subsequent parsing of the response.
        """
        self.clear_articles()
        self.query = query
        html = self._get_http_response(
            url=query.get_url(),
            log_msg="dump of query response HTML",
            err_msg="results retrieval failed"
        )
        if html is None:
            return
        self.parse(html)

    def get_citation_data(self, article):
        """
        Given an article, retrieves citation link. Note, this requires that
        you adjusted the settings to tell Google Scholar to actually
        provide this information, *prior* to retrieving the article.
        """
        if article["url_citation"] is None:
            return False
        if article.citation_data is not None:
            return True
        ScholarUtils.log("info", "retrieving citation export data")
        data = self._get_http_response(
            url=article["url_citation"],
            log_msg="citation data response",
            err_msg="requesting citation data failed"
        )
        if data is None:
            return False
        article.set_citation_data(data)
        return True

    def parse(self, html):
        """
        This method allows parsing of provided HTML content.
        """
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        # Fetch citation data eagerly so the stored article is complete.
        self.get_citation_data(art)
        self.articles.append(art)

    def clear_articles(self):
        """Clears any existing articles stored from previous queries."""
        self.articles = []

    def save_cookies(self):
        """
        This stores the latest cookies we're using to disk, for reuse in a
        later session.
        """
        if ScholarConf.COOKIE_JAR_FILE is None:
            return False
        try:
            self.cjar.save(ScholarConf.COOKIE_JAR_FILE, ignore_discard=True)
            ScholarUtils.log("info", "saved cookies file")
            return True
        except Exception as msg:
            ScholarUtils.log("warn", "could not save cookies file: %s" % msg)
            return False

    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, sends HTTP request and returns response payload.
        Returns None (after logging err_msg) on any failure.
        """
        if log_msg is None:
            log_msg = "HTTP response data follow"
        if err_msg is None:
            err_msg = "request failed"
        try:
            ScholarUtils.log("info", "requesting %s" % unquote(url))
            req = Request(url=url, headers={"User-Agent": ScholarConf.USER_AGENT})
            hdl = self.opener.open(req)
            html = hdl.read()
            ScholarUtils.log("debug", log_msg)
            ScholarUtils.log("debug", ">>>>" + "-" * 68)
            ScholarUtils.log("debug", "url: %s" % hdl.geturl())
            ScholarUtils.log("debug", "result: %s" % hdl.getcode())
            ScholarUtils.log("debug", "headers:\n" + str(hdl.info()))
            ScholarUtils.log("debug", "data:\n" + html.decode("utf-8"))  # For Python 3
            ScholarUtils.log("debug", "<<<<" + "-" * 68)
            return html
        except Exception as err:
            ScholarUtils.log("info", err_msg + ": %s" % err)
            return None