def EUN(Username, Password):
    # Log in
    br = spynner.Browser()
    br.load("http://www.bits-pilani.ac.in:12349/Login.aspx")
    br.wk_fill('input[name="TextBox1"]', Username)
    br.wk_fill('input[name="TextBox2"]', Password)
    br.click("input[type=submit]", wait_load=True, wait_requests=None, timeout=None)
    # Password check: a successful login redirects to the student home page
    if str(br.url) == "http://www.bits-pilani.ac.in:12349/Student/StudentHome.aspx":
        # Load the arrears page
        br.load("http://www.bits-pilani.ac.in:12349/Student/Dues.aspx")
        br.click("input[name=Button2]", wait_load=True, wait_requests=None, timeout=None)
        # Create a soup object
        plain_text = str(br.html)
        soup = BeautifulSoup(plain_text)
        # Extract the data
        tableData = soup.find("table", attrs={"id": "arrearGridView"})
        cells = tableData.findAll('td')
        # Print the dues
        for item in cells[-1]:
            print "Your dues : " + item.string
    else:
        print "Entered Username and Password do not match"
    br.close()
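# Minimal usage sketch for EUN (the credentials below are hypothetical; the
# function assumes spynner and BeautifulSoup are already imported at module level).
if __name__ == "__main__":
    EUN("f2013XXXX", "my_password")  # prints the dues or a login-failure message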
def __init__(self, lrules, lua, lh, ljq):
    self.rules = lrules
    self.browser = spynner.Browser(user_agent=lua, ignore_ssl_errors=False, headers=lh)
    self.browser.load_jquery(ljq)
    self.browser.set_url_filter(self.url_filter_ext)
def open(self, url0):
    html_body = ""
    try:
        self.browser = spynner.Browser()
        #self.browser.show()
        #h_heads = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1")]
        #h_heads = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")]
        # Load the page with a 120-second timeout
        #self.browser.load(url=url0, load_timeout=120, headers=h_heads)
        self.browser.load(url=url0, load_timeout=120)
        # Scroll the page to the bottom
        #js = "var q=document.documentElement.scrollTop=10000"
        #self.browser.runjs(js)
        #self.browser.wait(15)
        #self.browser.wait_load(15)
        html_body = str(self.browser.html)
    except:
        # Failed to download the dynamic page
        html_body = ""
    finally:
        self.browser.close()
    return html_body
def process_request(self, request, spider):
    normal_id = spider.normal_id
    try:
        if type(request) is not FormRequest:
            browser = spynner.Browser()
            browser.create_webview()
            browser.set_html_parser(pyquery.PyQuery)
            browser.hide()
            browser.load(request.url, load_timeout=50, tries=3)
            html = browser.html
            html = html.encode('utf-8')
            body = str(html)
            return HtmlResponse(url=request.url, body=body)
    except spynner.SpynnerTimeout:
        print 'Timed out: %s' % request.url
        self.col.update({'normal_id': normal_id}, {'$set': {'state': 'error'}})
    except Exception as e:
        print e.message
        self.col.update({'normal_id': normal_id}, {'$set': {'state': 'error'}})
def __init__(self):
    # Browser object
    agent = comm.random_useragent.getRandomUAItem()
    # self.m_browser = spynner.Browser()
    self.m_browser = spynner.Browser(user_agent=agent)
    # self.m_browser.set_proxy("58.52.201.119:8080")
    self.m_browser.hide()
    # self.m_browser.show()
    # Database helper object
    self.db_oper = db_helper_class(conf.db_conf)
    # Page size
    self.page_size = 20
    # Read offset
    self.r_offset = 0
    # Current progress
    self.curr_prog = 0
    # Total number of dropped records
    self.drop_count = 0
    # Application: full Tmall product details for Kangaiduo
    #TODO:
    # self.app_id = conf.app_conf.app_tmall_all_products_detail
    # self.class_id = conf.class_conf.cls_tmall_all_products_detail
    self.app_id = 999
    self.class_id = 999
def __init__(self, num_placa):
    """ Constructor """
    browser = spynner.Browser(
        user_agent='Mozilla/5.0 (X11; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1'
    )
    browser.create_webview()
    # Uppercase the licence-plate number
    self.placa = num_placa.upper()
    self.headers = {
        'Host': 'soaprd.sbs.gob.ec:7778',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'es-ES,es;q=0.8'
    }
    index_url = 'http://soaprd.sbs.gob.ec:7778/AppWGP/sbs_soat_index.jsp'
    browser.load(index_url)
    gen_principal_url = 'http://soaprd.sbs.gob.ec:7778/AppWGP/sbs_gen_principal.jsp'
    browser.load(gen_principal_url)
    # The start URL for the query
    self.url = 'http://soaprd.sbs.gob.ec:7778/AppWGP/sparametrosappgen'
    # Query variables to pass with the request
    variables = "hid_codSoftware=110&hid_codReporte=6&hid_target=centroUP&txt_q_placa=%s" % self.placa
    datos_headers = dict(
        self.headers, **{
            'Origin': 'http://soaprd.sbs.gob.ec:7778',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://soaprd.sbs.gob.ec:7778/AppSoat/sbs_soat_ctrConsulta.jsp?hid_codSoftware=110&hid_codReporte=6&hid_target=centroUP&hid_soporteExcel=S&COD_SOFTWARE=110&COD_UNIDAD_SUBUNIDAD=SRT&NOM_UNIDAD_SUBUNIDAD=SUBDIRECCION%20DE%20RECURSOS%20TECNOL%D3GICOS&COD_EMP=1747&PORTAL_USER=PCARGUA&STS_PERMITE_I=N&STS_PERMITE_D=N&STS_PERMITE_U=N&STS_PERMITE_S=S&COD_OPCION=393'
        })
    req = QNetworkRequest(QUrl(self.url))
    for k, v in datos_headers.items():
        req.setRawHeader(k, v)
    browser.webframe.load(req, QNetworkAccessManager.PostOperation, variables)
    browser.wait_load()
    datos = unicode(browser.webframe.toHtml())
    browser.close()
    self.parse_data(datos)
def main():
    browser = spynner.Browser()
    browser.load(url)
    #while 'ORIGIN' not in browser.html:
    browser.wait(3)
    #browser.wait_load(3)
    html = browser.html
    #web.store_content_in_file(html, '/tmp/spynner.html', overwrite=True)
    print len(browser.html)
def __init__(self, user_agent=None, **kwargs):
    try:
        import spynner
    except ImportError:
        raise DependencyNotInstalledError('spynner')

    if user_agent is None:
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
    self.br = spynner.Browser(user_agent=user_agent, **kwargs)
def __init__(self):
    comm.PLog.Log("Running instance: Tmall prescription-drug list")
    agent = comm.random_useragent.getRandomUAItem()
    self.browser = spynner.Browser(user_agent=agent)
    # Set a proxy
    # browser.set_proxy('http://219.133.31.120:8888')
    self.browser.hide()
    self.db_oper = comm.db_helper.db_helper_class(conf.db_conf)
    # Number of target links actually obtained
    self.target_link_cnt = 0
def get_spynner(self):
    if not self._spynner:
        try:
            import spynner
        except ImportError:
            raise SkipTest("Spynner must be installed if you want to use it")
        self._spynner = spynner.Browser()
    return self._spynner
def process_request(self, request, spider):
    browser = spynner.Browser()
    browser.create_webview()
    browser.set_html_parser(pyquery.PyQuery)
    browser.load(request.url, 20)
    try:
        browser.wait_load(10)
    except:
        pass
    return HtmlResponse(request.url, body=str(self.fixCharset(browser.html)))
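# Downloader middlewares like the ones above and below only run if they are
# registered in the Scrapy project's settings.py. The module and class path
# here is a placeholder for wherever the middleware actually lives.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SpynnerDownloaderMiddleware': 543,
}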
def getCookie(self):
    from time import sleep
    import re
    self.errmsg = ""
    tid = ""
    self.cookie = ""
    self.cookie_photo = ""
    browser = spynner.Browser(debug_level=spynner.DEBUG, debug_stream=debug_stream)
    try:
        ######## Log in
        browser.load("http://www.cyworld.com/cymain/?f=cymain")
        browser.load_jquery(force=True)
        browser.fill('input[name="ID"]', self.email)
        browser.fill('input[name="PASSWD"]', self.passwd)
        if self.debug:
            open("beforeClick.html", "w").write(browser.html.encode("mbcs"))
        browser.click("input[name=btnLOGIN]")
        browser.wait(self.PAGE_LOAD_WAIT_TIME)
        if self.debug:
            open("afterLogin.html", "w").write(browser.html.encode("mbcs"))
        # Check whether the login failed
        if self.email not in browser.html:
            self.errmsg = "The e-mail or password does not match. Please check them again."
            return
        elif self.debug:
            print "cookcook.getCookie: no e-mail/password error"
        c = browser.get_cookies()
        self.tid = self.getTidFromCookie(c)
        if self.debug:
            open("cookie.txt", "w").write(c)
        #===============================================================
        #self.tid = "21251087"
        #===============================================================
        # Load the photo-album board and grab its cookies
        browser.load("http://minihp.cyworld.com/svcs/MiniHp.cy/index/%s?tid=%s&urlstr=phot" % (self.tid, self.tid))
        browser.wait(self.PAGE_LOAD_WAIT_TIME)
        c = browser.get_cookies()
        self.cookie_photo = self.make_cookie_photo(c)
        if self.debug:
            open("cookie_photo.txt", "w").write(self.cookie_photo)
    except Exception, msg:
        if self.debug:
            print "Exception: ", msg
            traceback.print_exc(file=sys.stdout)
        self.errmsg = "There was a problem logging in. Check your internet connection, login e-mail and password...."
        return
def gcta_spider(cancertype, tierclass):
    browser = spynner.Browser()
    #browser.show()
    browser.hide()
    try:
        browser.load(url='http://54.84.12.177/PanCanFusV2/Fusions!cancerType')
    except spynner.SpynnerTimeout:
        print 'Timeout.'
    else:
        # Fill in the search criteria
        # browser.wk_fill('select[id="cancerType"]', 'BRCA')
        browser.wk_select('[id="cancerType"]', cancertype)
        # browser.wk_fill('select[id="tier"]', 'tier1')
        browser.wk_select('[id="tier"]', tierclass)
        # Click the search button and wait for the page to finish loading
        browser.wk_click('input[type="submit"]', wait_load=True)
        # Get the page HTML
        html = browser.html
        # Get the total number of pages
        pageNum = getNumOfPagesFromHtml(html)
        fusionGenePairs = []
        # First page
        if pageNum > 0:
            p = 1
            print 'processing page %d of %d' % (p, pageNum)
            fusionGenePairs = extractFusionGenePairsFromHtml(html)
        # Second to last page
        if pageNum > 1:
            for i in xrange(1, pageNum):
                try:
                    browser.wk_click('[id="fusions_next"]')
                    html = browser.html
                    tmp = extractFusionGenePairsFromHtml(html)
                    fusionGenePairs.extend(tmp)
                    tmp = []
                    p = i + 1
                    print 'processing page %d of %d' % (p, pageNum)
                except:
                    print 'failed to click next page'
                    break
                else:
                    continue
    browser.close()
    return fusionGenePairs
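# Example call, assuming gcta_spider and its helpers (getNumOfPagesFromHtml,
# extractFusionGenePairsFromHtml) are defined as above; the arguments mirror
# the commented-out defaults in the function.
pairs = gcta_spider('BRCA', 'tier1')
print 'found %d fusion gene pairs' % len(pairs)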
def process_request(self, request, spider):
    browser = spynner.Browser()
    browser.create_webview()
    browser.set_html_parser(pyquery.PyQuery)
    browser.load(request.url, 20)
    try:
        browser.wait_load(10)
    except:
        pass
    string = browser.html
    string = string.encode('utf-8')
    renderedBody = str(string)
    return HtmlResponse(request.url, body=renderedBody)
def __init__(self):
    self.m_browser = spynner.Browser()
    self.m_browser.hide()
    # Database object
    self.db_oper = db_helper_class(conf.db_conf)
    # Page size
    self.page_size = 20
    # Read offset
    self.r_offset = 0
    self.app_id = conf.app_conf.app_360haoyao_id
    self.class_id = conf.class_conf.class_360haoyao_all
def process_request(self, request, spider):
    if request.url.find("nuomi") != -1:
        browser = spynner.Browser()
        browser.create_webview()
        browser.set_html_parser(pyquery.PyQuery)
        # browser.load(request.url, 20)
        browser.load(url=request.url, load_timeout=120, tries=1)
        try:
            browser.wait_load(1)
        except:
            pass
        html = str(browser.webframe.toHtml().toUtf8())
        renderedBody = str(html)
        return HtmlResponse(request.url, body=renderedBody)
def run(self):
    while True:
        n, site = self.queue.get()
        url = site.strip()
        result_path = url
        result_path = result_path.split("/")[-1]
        zip_path = result_path + ".zip"
        zip_file = os.path.join(out_dir, zip_path)
        result_path = result_path + ".html"
        result_file = os.path.join(out_dir, result_path)
        if os.path.exists(zip_file):
            self.out_queue.put((zip_file, 0))
            self.queue.task_done()
            continue
        # creating and closing browser is wasteful but guarantees no
        # memory issues
        browser = spynner.Browser()
        browser.create_webview(True)
        try:
            browser.load(url, load_timeout=20)
        except spynner.browser.SpynnerTimeout:
            print("Load timeout reading %i, %s" % (n, url))
        print("Scraping %i, %s" % (n, url))
        try:
            f = open(result_file, mode="w")
            read = browser._get_html()
            f.writelines(read)
            f.close()
            zf = zipfile.ZipFile(zip_file, mode='w')
            zf.write(result_file, compress_type=compression)
            os.remove(result_file)
            zf.close()
            self.out_queue.put((zip_file, 0))
        except TypeError:
            print("Error reading %i, %s" % (n, url))
            self.out_queue.put((zip_file, 2))
        except spynner.browser.SpynnerTimeout:
            print("Timeout reading %i, %s" % (n, url))
            self.out_queue.put((zip_file, 3))
        except:
            print("Unknown error reading %i, %s" % (n, url))
            self.out_queue.put((zip_file, 4))
        browser.close()
        del browser
        self.queue.task_done()
def process_request(self, request, spider):
    # if spider.name in settings.WEBKIT_DOWNLOADER:
    # if( type(request) is not FormRequest ):
    browser = spynner.Browser()
    browser.create_webview()
    browser.set_html_parser(pyquery.PyQuery)
    browser.load(request.url, 20)
    try:
        browser.wait_load(10)
    except:
        pass
    string = browser.html
    string = string.encode('utf-8')
    renderedBody = str(string)
    return HtmlResponse(request.url, body=renderedBody)
def get_gzh_url(gzh_name):
    browser = spynner.Browser()
    browser.show()
    wurl = unicode('http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&_sug_=n&_sug_type_='.format(gzh_name))
    try:
        browser.load(url=wurl)
    except spynner.SpynnerTimeout:
        print 'Timeout.'
    else:
        html = browser.html
        soup = BeautifulSoup(html)
        info = soup.find(id="sogou_vr_11002301_box_0")
        gzh_url = info.get('href')
    browser.close()
    return gzh_url
def loadPage(url):
    # get browser object
    #browser = spynner.Browser(debug_level = spynner.DEBUG)
    browser = spynner.Browser()
    # create browser window
    browser.create_webview()
    browser.show()
    # load login page
    try:
        browser.load_jquery(True)
        browser.load(url)
        return browser
    except SpynnerTimeout:
        print("could not load page")
        return
def process_request(self, request, spider):
    browser = spynner.Browser()
    #if 'Cookie' in request.headers.keys():
    #    browser.set_cookies(request.headers.Cookie)
    browser.create_webview()
    browser.set_html_parser(pyquery.PyQuery)
    browser.load(request.url, 300)
    try:
        browser.wait_load(10)
    except:
        pass
    string = browser.html.encode('utf-8')
    renderedBody = str(string)
    browser.close()
    #return HtmlResponse(request.url, Cookies=browser.cookies, body=renderedBody)
    return HtmlResponse(request.url, body=renderedBody)
def run(self):
    url = os.path.join(common.URL, "_cnc/channelclient")
    browser = spynner.Browser()
    browser.create_webview(True)
    browser.load(url, load_timeout=30, tries=True)
    browser.set_javascript_prompt_callback(self._message_listener)
    while self.kill == False:
        if self.token and not self.connected and self.connecting:
            browser.runjs("openChannel('" + self.token + "')")
            self.connected = True
            self.connecting = False
        if self.token == None and self.connected:
            browser.runjs("closeSocket()")
        browser.wait(1)
    browser.close()
def parse(self, response):
    try:
        browser = spynner.Browser()
        browser.show()
        try:
            # Load the login page
            browser.load(response.url, load_timeout=60, tries=3)
        except spynner.SpynnerTimeout:
            print 'download %s timeout' % response.url
            self.col.update({'vendor': self.vendor}, {'$set': {'state': 'error'}})
        else:
            # Fill in the username and password
            browser.wk_fill('input[id="modlgn_username"]', 'lowseasonwind')
            browser.wk_fill('input[id="modlgn_passwd"]', 'zhuimeng7')
            browser.wait(3)
            # Submit the login form
            browser.runjs("document.getElementById('form-login').submit();")
            browser.wait(5)
            try:
                # After logging in, load the software download page
                browser.load('http://www.kingview.com/downloads/software.html')
            except spynner.SpynnerTimeout:
                print 'download %s timeout' % 'http://www.kingview.com/downloads/software.html'
                self.col.update({'vendor': self.vendor}, {'$set': {'state': 'error'}})
            else:
                print 'goto software page %s' % browser.url
                body = browser.html
                body = str(body)
                # Must use return here, not yield, otherwise an error is raised; spynner's
                # browser.py (around line 477) also needs patching or the text comes back garbled
                return self.parse_item(
                    HtmlResponse(url='http://www.kingview.com/downloads/software.html', body=body))
    except Exception as e:
        self.col.update({'vendor': self.vendor}, {'$set': {'state': 'error'}})
def get_articles_url(gzh_url):
    a_urls = []
    base_url = "http://mp.weixin.qq.com"
    browser = spynner.Browser()
    browser.show()
    try:
        browser.load(url=gzh_url)
    except spynner.SpynnerTimeout:
        print 'Timeout.'
    else:
        html = browser.html
        soup = BeautifulSoup(html)
        for link in soup.findAll("h4"):
            f_url = link.get('hrefs')
            f_url = base_url + f_url
            a_urls.append(f_url)
    browser.close()
    return a_urls
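# The two helpers above are meant to be chained: resolve the public account's
# profile URL first, then collect its article links. The account name below is
# purely illustrative.
gzh_url = get_gzh_url(u'some_account_name')
for article_url in get_articles_url(gzh_url):
    print article_url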
def load_html(self):
    """ load html using spynner """
    browser = spynner.Browser()
    browser.hide()
    # browser.show()
    try:
        browser.load(self.url, load_timeout=300)
        browser.wait(self.wait)
        html = browser.html
    except spynner.SpynnerTimeout:
        html = None
    else:
        html = browser.html
    browser.close()
    return html
def main():
    try:
        br = spynner.Browser()
        status, list_of_versions = get_app_versions(557137623, br)  # Angry Birds Star Wars
        #status, list_of_versions = get_app_versions(284882215, br)  # Facebook
        #status, list_of_versions = get_app_versions(310633997, br)  # Whatsapp
        if list_of_versions:
            for version in list_of_versions:
                print '_id:', version['_id']
                print 'App ID:', version['app_id']
                print 'Date:', version['date']
                print 'Unixtime:', version['unixtime']
                print 'Number:', version['number']
                for update in version['updates']:
                    print '-', update
                print ''
            # End of for loop.
        # End of if statement.
    finally:
        br.close()
def __init__(self):
    # Browser object
    agent = comm.random_useragent.getRandomUAItem()
    # self.m_browser = spynner.Browser()
    self.m_browser = spynner.Browser(user_agent=agent)
    self.m_browser.hide()
    # self.m_browser.show()
    self.db_oper = db_helper_class(conf.db_conf)
    # Page size
    self.page_size = 20
    # Read offset
    self.r_offset = 0
    # Current progress
    self.curr_prog = 0
    # Total number of dropped records
    self.drop_count = 0
    # Application: Tmall health-product details
    self.app_id = conf.app_conf.app_tmall_health_prods_detail
    self.class_id = conf.class_conf.cls_tmall_health_prods_detail
def browse(url, spynner_browser_timeout, proxy=None):
    """
    Visit the URL with a simulated browser and return the HTML source.
    :param url: the URL to load
    :param spynner_browser_timeout: load timeout
    :param proxy: proxy address
    :return: HTML source
    """
    urlpret = urlparse.urlparse(url)
    browser = spynner.Browser(
        user_agent="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        headers=[("Referer", '%s://%s' % (urlpret.scheme, urlpret.netloc))])
    if proxy != None:
        browser.set_proxy(proxy)
    browser.load(url=url, load_timeout=spynner_browser_timeout)
    html = browser.html
    browser.close()
    return html
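# Example call to browse(): fetch a rendered page with a 60-second timeout,
# optionally through a proxy. The URL and proxy address are placeholders.
html = browse("http://example.com", 60)
# html = browse("http://example.com", 60, proxy="127.0.0.1:8080")
print len(html)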
def __init__(self):
    # Browser object
    agent = comm.random_useragent.getRandomUAItem()
    # self.m_browser = spynner.Browser()
    self.m_browser = spynner.Browser(user_agent=agent)
    self.m_browser.hide()
    # self.m_browser.show()
    # Database object
    self.m_db_obj = comm.db_helper.db_helper_class(conf.db_conf)
    # Clear the session
    self.clear_session()
    # Job identifiers
    #TODO:
    # self.app_id = conf.app_conf.app_tmall_all_products_list
    # self.class_id = conf.class_conf.cls_tmall_all_products_list
    self.app_id = 999
    self.class_id = 999
    self.job_id = "%s_%s_%s" % (time.strftime('T%Y%m%d%H%M'), self.app_id, self.class_id)
    self.job_id = "T201612311000_106_800010"
def test(self):
    IMG = self.img
    URL = self.url
    assert self.proxyg is not None, "no global proxy set"
    assert self.proxyd is not None, "no download proxy set"
    br = self.browser = spynner.Browser(ignore_ssl_errors=False,
                                        user_agent=self.user_agent,
                                        debug_level=spynner.WARNING,
                                        debug_stream=sys.stderr)
    br.show()
    data, content = {}, {}
    # no proxy
    data['noproxy'] = br.download(IMG)
    br.load(URL, None)
    content['noproxy'] = br.html
    # no proxy - alt1
    br.set_proxy("")
    data["proxy_void"] = br.download(IMG)
    br.load(URL, None)
    content["proxy_void"] = br.html
    # no proxy - alt2
    br.set_proxy(None)
    data["proxy_none"] = br.download(IMG)
    br.load(URL, None)
    content["proxy_none"] = br.html
    # global proxy
    br.set_proxy(self.proxyg)
    data["proxy_g"] = br.download(IMG)
    br.load(URL, None)
    content["proxy_g"] = br.html
    # use a proxy only @ download level
    br.load(URL)
    data["proxy_d"] = br.download(IMG, proxy_url=self.proxyd)
    for i in data:
        if data["noproxy"] != data[i]:
            raise Exception("Download failed for %s" % i)