def login():
    """Log each of the three configured accounts into the QQ gift page,
    select region/server, confirm the claim dialog, then log out.

    Relies on module-level globals: ``gift_url`` (page URL) and the
    parallel lists ``username`` / ``passwd`` (three accounts each).
    Stores the splinter browser in the global ``b``.
    """
    global b
    b = Browser(driver_name="chrome")
    # One full claim cycle per account.
    # NOTE(review): source was whitespace-collapsed; the loop is assumed to
    # span through the final sleep(3) -- confirm against the original file.
    for i in range(0, 3):
        b.visit(gift_url)
        b.find_by_id("ptLoginBtn").click()
        sleep(1)
        # Credentials are entered inside the QQ login iframe.
        with b.get_iframe('loginFrame') as iframe:
            iframe.find_by_id('u').fill(username[i])
            iframe.find_by_id('p').fill(passwd[i])
            iframe.find_by_id('go').click()
        sleep(1)
        b.find_by_tag("a")[2].click()
        sleep(1)
        # Region selector -- change the option value to your own region;
        # this one is "大地飞鹰" (Dadi Feiying).
        b.find_by_xpath('//select[@id="area1ContentId_wuxia"]/option[@value="7609516"]')._element.click()
        sleep(1)
        # Server selector -- change the option value to your own server;
        # this one is "藏锋谷" (Cangfeng Valley).
        b.find_by_xpath('//select[@id="areaContentId_wuxia"]/option[@value="2002"]')._element.click()
        sleep(1)
        b.find_by_id("confirmButtonId_wuxia").click()
        sleep(1)
        # Dismiss the confirmation alert, then log out so the next
        # account can log in cleanly.
        b.get_alert().dismiss()
        sleep(1)
        b.find_by_id("ptLogoutBtn").click()
        sleep(5)
        print u"领取完毕"
        sleep(3)
class metaCatcher: def __init__(self): self.browser = Browser() self.browser.driver.set_page_load_timeout(5) def set_url(self, url): self.url = url def download(self, url, name): if os.path.exists(name): return result = requests.get(url) if '</html>' in result.content: return output = open(name, 'w') output.write(result.content) output.close() def catch(self): self.browser.visit(self.url) items = self.browser.find_by_css('.dataset-heading') tmpitems = [] setName = [] for item in items: href = item.find_by_tag('a') setName.append(href[0].text) tmpitems.append(href[0]['href']) print setName i = -1 for href in tmpitems: i += 1 try: self.browser.visit(href) except TimeoutException: pass groups = self.browser.find_by_tag('a') for g in groups: #g.text downloadurl = str(g['href']) if g.text == 'Download Metadata': #print downloadurl #print downloadurl name = setName[i] self.download(downloadurl, './meta/Safety1/' + name + '.json')
def enterprise(self):
    """Scrape a WeChat-article page (URL in self.text) of global enterprise
    news and format it into a digest string.

    Returns:
        str: header + summary + numbered items, optionally followed by the
        fiscal-report paragraph, then the source URL.
    """
    driver = Browser(driver_name=BROWSER['SPLINTER']['NAME'],
                     executable_path=BROWSER['SPLINTER']['PATH'],
                     headless=True)
    try:
        driver.visit(self.text)
        # Fixed: header is now always initialized, so `content` can no
        # longer be unbound (NameError) when the page has no <strong>
        # items or when the very first item is the fiscal marker.
        content = '全球企业动态[%s]\n\n' % datetime.now().date()
        fiscal = False
        n = 0  # 1-based count of <strong> items seen, used for the xpath below
        for seq, item in enumerate(driver.find_by_tag('strong')):
            n += 1
            # '财报信息' marks the start of the fiscal-report section.
            if item.text == '财报信息':
                fiscal = True
                break
            if seq == 1:
                content += '概要:\n%s\n' % item.text
            elif seq > 1:
                content += '\n%d. %s\n' % (seq - 1, item.text)
            # seq == 0 is the page title, already covered by the header.
        if fiscal:
            # The fiscal paragraph sits at p[2n] in the article body.
            content += '\n财报信息:\n' + driver.find_by_xpath(
                '//*[@id="js_content"]/p[%s]' % str(n * 2)).text
        content += '\n\n' + self.text
        return content
    finally:
        # Fixed: the locally-created browser was never closed, leaking a
        # headless browser process on every call.
        driver.quit()
NOTE: you may want to change the webdriver, " Browser() ", and add your
preferred driver, Browser('webdriver.chrome') for example, but by default
it is always set as webdriver.firefox

NOTE: choose one url that contains broken links to see the response

NOTE: this is basic code, you can improve it and do what you want,
believe, you could do almost everything :)

For more information, see the docs: http://splinter.cobrateam.info/docs/
"""
from splinter.browser import Browser
from splinter.request_handler.status_code import HttpResponseError

# Python 2 script: checks every link on a page and reports broken ones.
browser = Browser()

# Visit URL
url = "http://splinter.cobrateam.info/"
browser.visit(url)

# Get all links in this page
urls = [a['href'] for a in browser.find_by_tag('a')]

# Visit each one link and verify if is ok
for url in urls:
    try:
        browser.visit(url)
        if browser.status_code.is_success():
            print '(', browser.status_code.code, ') visit to', url, 'was a success!'
    except HttpResponseError, e:
        # Non-2xx responses raise; report the status and reason.
        print '(', e.status_code, ') visit to', url, 'was fail! Error:', e.reason

browser.quit()
TAMANHO = B.find_by_css( '.modal-body .item.active .input-group a')[1].click() else: TAMANHO = SELETOR_TAMANHOS[random.randint(0, QUANTIDADE_TAMANHOS - 1)] #Seleciona tamanho TAMANHO.find_by_css( 'a')[1].click() #Clica na tag A que adiciona itens time.sleep(SLEEP) i = i + 1 B.find_by_css('div.modal-footer').click() B.find_by_css('button.btn.btn-primary').click() VARPEDIDOS = VARPEDIDOS + 1 #Selecionar o carrinho time.sleep(SLEEP) B.find_by_tag('span.badge.badge-primary').click() #B.find_by_css('i.glyphicon.glyphicon-shopping-cart').click() """Caso queira acrescentar as quantidades depois de selecionar todos os produtos... #Selecionar modal dos tamanhos time.sleep(SLEEP) B.find_by_css('i.glyphicon.glyphicon-edit').click() #Selecionar quantidade i = 0 while i < QTD: SELETOR_TAMANHOS = B.find_by_css('.modal-body .item.active .input-group') QUANTIDADE_TAMANHOS = len(SELETOR_TAMANHOS) if QUANTIDADE_TAMANHOS == 1: TAMANHO = SELETOR_TAMANHOS.first else: TAMANHO = SELETOR_TAMANHOS[random.randint(0, QUANTIDADE_TAMANHOS-1)] #seleciona tamanho
class DouYin(object):
    def __init__(self, width=500, height=300):
        """
        Douyin (TikTok China) app video downloader.
        """
        # Headless Chrome with a spoofed desktop user agent.
        chrome_options = Options()
        chrome_options.add_argument(
            'user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"'
        )
        self.driver = Browser(driver_name='chrome',
                              executable_path='F:/chromedriver_win32',
                              options=chrome_options,
                              headless=True)

    def get_video_urls(self, user_id):
        """
        Resolve a user's video list via Douyin's search and post APIs.

        Parameters:
            user_id: the user ID to look up
        Returns:
            video_names: list of video file names
            video_urls: list of video share URLs
            nickname: the user's display name
        """
        video_names = []
        video_urls = []
        unique_id = ''
        # Poll the search API until the top result's unique_id matches the
        # requested user_id.
        while unique_id != user_id:
            search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
            req = requests.get(url=search_url, verify=False)
            html = json.loads(req.text)
            aweme_count = html['user_list'][0]['user_info']['aweme_count']
            uid = html['user_list'][0]['user_info']['uid']
            nickname = html['user_list'][0]['user_info']['nickname']
            unique_id = html['user_list'][0]['user_info']['unique_id']
        # Fetch all posts for the resolved uid in one request.
        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (
            uid, aweme_count)
        req = requests.get(url=user_url, verify=False)
        html = json.loads(req.text)
        i = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            # Fall back to a numeric file name when the description is
            # just the generic app slogan.
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(i) + '.mp4')
                i += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])
        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """
        Get the watermarked video's playback address.

        Parameters:
            video_url: watermarked video page URL
        Returns:
            download_url: watermarked video download address
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')
        # The playback URL is embedded in the last <script> as a JS array.
        script = bf.find_all('script')[-1]
        video_url_js = re.findall('var data = \[(.+)\];', str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name, watermark_flag=True):
        """
        Download a video, streaming it to disk with a progress display.

        Parameters:
            video_url: watermarked video page URL
            video_name: output file name
            watermark_flag: if True, resolve a watermark-FREE URL first
        Returns:
            None
        """
        size = 0
        if watermark_flag == True:
            video_url = self.remove_watermark(video_url)
        else:
            video_url = self.get_download_url(video_url)
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                        sys.stdout.flush()

    def remove_watermark(self, video_url):
        """
        Get a watermark-free playback address via douyin.iiilab.com.

        Parameters:
            video_url: watermarked video page URL
        Returns:
            watermark-free video download address
        """
        self.driver.visit('http://douyin.iiilab.com/')
        self.driver.find_by_tag('input').fill(video_url)
        self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
        html = self.driver.find_by_xpath(
            '//div[@class="thumbnail"]/div/p')[0].html
        bf = BeautifulSoup(html, 'lxml')
        return bf.find('a').get('href')

    def run(self):
        """
        Entry point: prompt for a user ID and download all their videos.

        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        user_id = input('请输入ID(例如40103580):')
        video_names, video_urls, nickname = self.get_video_urls(user_id)
        # One output directory per user, named after the nickname.
        if nickname not in os.listdir():
            os.mkdir(nickname)
        print('视频下载中:共有%d个作品!\n' % len(video_urls))
        for num in range(len(video_urls)):
            print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num + 1, video_urls[num]))
            # Strip path separators so the name is a valid file name.
            if '\\' in video_names[num]:
                video_name = video_names[num].replace('\\', '')
            elif '/' in video_names[num]:
                video_name = video_names[num].replace('/', '')
            else:
                video_name = video_names[num]
            self.video_downloader(video_urls[num], os.path.join(nickname, video_name))
            print('\n')
        print('下载完成!')

    def hello(self):
        """
        Print the welcome banner.

        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('\t\t作者:Jack Cui')
        print('*' * 100)
class CatchTicket(object):
    """Semi-automatic 12306.cn train-ticket ordering bot built on splinter.

    NOTE(review): source was whitespace-collapsed; nesting in
    get_train_index/start_order is reconstructed -- confirm against the
    original file.
    """

    def __init__(self):
        self.train = "G87"  # train to be ordered
        self.username = u"xxx"  # your login info on 12306
        self.passwd = u"xxx"  # your login info on 12306
        self.date = u"2019-01-21"
        # self.from_station = u'%u5317%u4EAC%u897F%2CBXP' # beijing xi
        # self.to_station = u'%u897F%u5B89%u5317%2CEAY' # xian bei
        # Station cookies are %u-escaped "station name,telecode" values.
        xian_bei_cookie = u"%u897F%u5B89%u5317%2CEAY"
        beijing_xi_cookie = u"%u5317%u4EAC%u897F%2CBXP"
        langfang = u"%u5ECA%u574A%2CLJP"
        shanghai_hongqiao = u"%u4E0A%u6D77%2CSHH"
        self.from_station = beijing_xi_cookie
        self.to_station = xian_bei_cookie
        self.person = [u"xxx", u"xxx"]  # your name here
        self.login_url = "https://kyfw.12306.cn/otn/login/init"
        self.login_comp_url = "https://kyfw.12306.cn/otn/view/index.html"
        self.search_url = "https://kyfw.12306.cn/otn/leftTicket/init"
        self.order_submit_url = "https://kyfw.12306.cn/otn/confirmPassenger/initDc"
        self.driver = Browser(driver_name="chrome")
        self.driver.driver.set_window_size(1400, 1000)

    def login(self):
        """Fill in credentials, then wait for the human to solve the
        captcha; login is detected by the URL changing to the index page."""
        self.driver.visit(self.login_url)
        self.driver.fill("loginUserDTO.user_name", self.username)
        self.driver.fill("userDTO.password", self.passwd)
        while self.driver.url != self.login_comp_url:
            print("fill in the certi code yourself...")
            sleep(1)
        print("login complete")

    def get_train_index(self):
        # get the index of the selected train in the result list
        all_div_tag = self.driver.find_by_tag("div")
        train_index = 0
        for div_tag in all_div_tag:
            div_tag = div_tag.text
            if "G" in div_tag:  # contains train
                for context in div_tag.split("\n"):
                    if re.search(r"G\d+", context):  # find train named e.g. G25
                        if context == self.train:
                            break
                        # Count trains listed before the target one.
                        train_index += 1
                break
        return train_index

    def start_order(self):
        """Log in, set the route via cookies, poll the query button until
        booking succeeds, then select passengers and submit the order."""
        self.login()
        self.driver.visit(self.search_url)
        # 12306 stores the route in these cookies; set them and reload.
        self.driver.cookies.add({"_jc_save_fromStation": self.from_station})
        self.driver.cookies.add({"_jc_save_toStation": self.to_station})
        self.driver.cookies.add({"_jc_save_fromDate": self.date})
        self.driver.reload()
        self.driver.find_by_text(u"GC-高铁/城际").click()
        # Click "duration" twice to sort the listing.
        for i in range(2):
            self.driver.find_by_text(u"历时").click()
        count = 1
        # Keep querying until we land on the order-submission page.
        while self.driver.url != self.order_submit_url:
            try:
                print("Searching for {} time".format(count))
                self.driver.find_by_text(u"查询").click()
                if self.driver.find_by_text(u"网络繁忙"):
                    # "Network busy" dialog -- acknowledge and retry.
                    self.driver.find_by_text(u"确认").click()
                else:
                    train_index = self.get_train_index()
                    self.driver.find_by_text(u"预订")[train_index].click()
                    sleep(1)
                count += 1
            except Exception as e:
                print(e)
                count += 1
                continue
        print("enter order submit page")
        sleep(1)
        # Tick each configured passenger.
        for name in self.person:
            self.driver.find_by_text(name).last.click()
        try:
            self.driver.find_by_text(u"提交订单").click()
            # self.driver.find_by_text(u'确认').click()
        except Exception as e:
            print(e)
            print("order complete not tru")
browser = Browser() #As of March 27, 2016 inp = csv.reader(file(fd + 'Complete_list.csv', 'rb')) head = inp.next() for e, i in enumerate(head): print e, i fd2 = 'g:/health_data/provider_urls/' for i in inp: if not re.search("^None|NOT SUBMITTED", i[2]): print i[1] try: outp = csv.writer(open(os.path.join(fd2, i[1] + '.csv'), 'wb'), delimiter='\t') browser.visit(i[2]) sleep(1) try: need = browser.find_by_css('pre') proc = json.loads(need[0].text) except: need = browser.find_by_tag('body') proc = json.loads(re.sub('}.*?$', '}', need[0].text)) for p in proc['provider_urls']: outp.writerow([p]) #call('taskkill /F /IM firefox.exe') except: traceback.print_exc()
class GetInfo():
    """Scraper/exporter for the 17dz.com accounting portal.

    Drives a logged-in splinter browser and pulls data (vouchers, balance
    sheets, cash flow, settings) by injecting jQuery AJAX helpers via
    evaluate_script, polling a `top.*` global for the result, then saving
    through the injected `exportSql` database helper.

    NOTE(review): source was whitespace-collapsed; statement nesting and
    the internal line breaks of the JS template strings are reconstructed
    -- confirm against the original file.
    """
    # Class-level placeholders, replaced with real objects in __init__.
    conn = ''
    browser = ''

    def __init__(self, browser, conn=None, to_sql=None):
        # Database-export helper (handles table naming / login user).
        self.exportSql = to_sql
        if browser:
            self.browser = browser
            self.conn = conn
        else:
            try:
                self.browser = Browser("chrome", headless=False)
                self.browser.driver.set_window_size(1600, 1000)
            except Exception as e:
                self.browser = None

    # Log in
    def login(self, info):
        # info carries 'account' and 'password'.
        account = info.get('account')
        password = info.get('password')
        # Already logged in as the same account? Done.
        if self.browser.url == 'https://17dz.com/manage/index.html':
            if account == self.loginName():
                return '登陆成功'
        self.browser.visit('https://17dz.com/home/login.html')
        # Reject empty account or password.
        if not all([account, password]):
            return jsonify(errmsg='参数不全')
        with self.browser.get_iframe('loginIframe') as iframe:
            iframe.find_by_css('input[id="id__0"]').first.fill(account)
            iframe.find_by_css('input[id="id__1"]').first.fill(password)
            iframe.find_by_text('登录').first.click()
            time.sleep(2)
        # Success is detected by the redirect to the manage index page.
        if self.browser.url == 'https://17dz.com/manage/index.html':
            return '登陆成功'
        else:
            return '账号和密码不匹配,请重新输入'

    def loginName(self):
        """Return the login name of the current session (or '' on timeout)."""
        js = '''getloginName=function(){ $.ajax({ type:'GET', url: 'https://17dz.com/xqy-portal-web/manage/login/getLoginSession?_=1544003601263', contentType:'application/json;charset=utf-8', success: function (result) { if(result.success) { top.Id = result; } else { top.Id = result; } } }) }'''
        self.browser.evaluate_script(js)
        self.browser.evaluate_script('getloginName()')
        i = 1
        loginName = ''
        # Poll top.Id up to ~3s for the async AJAX result.
        while True:
            if self.browser.evaluate_script("top.Id"):
                loginName = self.browser.evaluate_script('top.Id').get(
                    'body').get('loginName')
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        # Reset the global so the next poll starts clean.
        self.browser.evaluate_script('top.Id=""')
        return loginName

    # After login: fetch the list of account books (账套).
    def getAllzt(self):
        # self.browser.find_by_text('凭证查看').first.click()
        import datetime
        d = datetime.datetime.now()
        # Year of the previous month (rolls back a year in January).
        period = datetime.date(d.year - (d.month == 1), d.month - 1
                               or 12, 1).strftime('%Y')
        js = '''getCustomerId=function(){ var data = {customerName:"", pageNo:1 , pageSize:"500" , searchType:"ALL"}
$.ajax({ type:'POST', url: 'https://17dz.com/xqy-portal-web/manage/customer/queryCustomers', contentType:'application/json;charset=utf-8', data : JSON.stringify(data), success: function (result) { if(result.success) { top.customerId = result; } else { top.customerId = result; } } }) } '''
        new_js = '''getCustomerId=function(period){ var data = {"pageNo":1, "pageSize":"500", "period":period, "customerNoOrNameLike":"", "accountCloseStatus":"", "sortField":"", "sortDirection":false } $.ajax({ type:'POST', url: 'https://17dz.com/xqy-portal-web/manage/finance/queryCustomer', contentType:'application/json;charset=utf-8', data : JSON.stringify(data), success: function (result) { if(result.success) { top.customerId = result; } else { top.customerId = result; } } }) }'''
        # NOTE(review): `js` (the old endpoint) is defined but unused.
        self.browser.evaluate_script(new_js)
        self.browser.evaluate_script('getCustomerId("%s")' % period)
        i = 1
        Id = []
        while True:
            if self.browser.evaluate_script("top.customerId"):
                Id = self.browser.evaluate_script(
                    'top.customerId')['body']['list']
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.customerId=""')
        customerId = Id[0]['customerId']
        js2 = '''getAllzt=function(customerId){ var data = {key: "", customerId: customerId} $.ajax({ type:'POST', url: 'https://17dz.com/xqy-portal-web/manage/workbench/getAccountCustomers', contentType:'application/json;charset=utf-8', data : JSON.stringify(data), success: function (result) { if(result.success) { top.zt_data = result; } else { top.zt_data = result; } } }) }'''
        self.browser.evaluate_script(js2)
        self.browser.evaluate_script('getAllzt("%s")' % customerId)
        i = 1
        ztData = {}
        while True:
            if self.browser.evaluate_script("top.zt_data"):
                ztData = self.browser.evaluate_script('top.zt_data').get(
                    'body', '')
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.zt_data=""')
        return ztData

    # Switch account book; returns the list of periods (YYYYMM strings)
    # from its creation period to its last period.
    def switchZt(self, params):
        customerId = params['customerId']
        accountSetId = params['accountSetId']
        customerName = params['customerName']
        customerShortName = params['customerShortName']
        # getKhxx('127059881','4320800','上海路卡服装有限公司','上海路卡服装有限公司')
        js = '''getKhxx=function(customerId,accountSetId,customerName,customerShortName){ $.ajax({ type:'PUT', url:'https://17dz.com/xqy-portal-web/finance/account/session/accountSet', data : {customerId:customerId,accountSetId:accountSetId,customerName:customerName,customerShortName:customerShortName,platform:'yqdz'}, dataType: 'json', success: function (result) { if(result.success) { top.khxx = result; } else { top.khxx = result; } } }) }'''
        self.browser.evaluate_script(js)
        self.browser.evaluate_script(
            'getKhxx("%s","%s","%s","%s")' %
            (customerId, accountSetId, customerName, customerShortName))
        i = 1
        khxx = {}
        while True:
            if self.browser.evaluate_script("top.khxx"):
                khxx = self.browser.evaluate_script('top.khxx')
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.khxx=""')
        try:
            startQj = khxx.get('body').get('createPeriod')
            endQj = khxx.get('body').get('lastPeriod')
        except Exception as e:
            return '网络异常,请稍后重试'
        # Expand [start, end] into the set of months in between.
        dateStart = datetime.datetime.strptime(startQj, '%Y%m')
        dateEnd = datetime.datetime.strptime(endQj, '%Y%m')
        dates = []
        dates.append(dateStart.strftime('%Y%m'))
        while dateStart <= dateEnd:
            # NOTE(review): 4-week steps approximate months; the set()
            # below deduplicates repeats within the same month.
            dateStart += datetime.timedelta(weeks=4)
            dates.append(dateStart.strftime('%Y%m'))
        datesList = sorted(list(set(dates)))
        return datesList

    # Vouchers (凭证)
    def voucher(self, QjList, ztID, infoname):
        js = '''get_Voucher=function(kjqj_date){ var data = {"beginPeriod":kjqj_date, "endPeriod":kjqj_date, "titleCode":"", "beginNumber":"", "endNumber":"", "beginMoney":"", "endMoney":"", "summary":"", "pageSize":"1000", "pageNo":0 } $.ajax({ type: "POST", url: 'https://17dz.com/xqy-portal-web/finance/accDocs/list', contentType:'application/json;charset=utf-8', data: JSON.stringify(data), success: function (result) { if(result.success) { top.voucher_data = result; } else { top.voucher_data = result; }
} }) } '''
        self.browser.evaluate_script(js)
        # Create the infoname row in the DB; its id keys the inserts below.
        infonameID = self.exportSql.init_infoname(infoname).id
        try:
            for Qj in QjList:
                self.browser.evaluate_script('get_Voucher("%s")' % Qj)
                i = 1
                voucher_data = {}
                while True:
                    if self.browser.evaluate_script("top.voucher_data"):
                        data = self.browser.evaluate_script('top.voucher_data')
                        if data:
                            voucher_data = data.get('body')
                        break
                    elif i > 5:
                        break
                    else:
                        time.sleep(0.5)
                        i += 1
                    pass
                self.browser.evaluate_script('top.voucher_data=""')
                voucherString = json.dumps(voucher_data)
                # Persist to the database.
                self.exportSql.insert_new(ztID, Qj, infonameID, voucherString)
        except Exception as e:
            msg = '凭证导出失败:{}'.format(str(e))
        else:
            msg = '凭证导出成功'
        return msg

    # Account balance sheet (科目余额表)
    def kmsheet(
            self,
            QjList,
            ztID,
            infoname,
    ):
        # Create the infoname row in the DB.
        infonameID = self.exportSql.init_infoname(infoname).id
        try:
            for Qj in QjList:
                # Fetch account balances for the period.
                km_data = self.getKMBody(Qj)
                '''第一版 # #获取数量金额式 # slje_data = self.getKMBody(Qj,"B,S") # #获取外币金额式 # wbje_data = self.getKMBody(Qj,"B,W") # li = {} # li['kmye'] = km_data # li['slje'] = slje_data # li['wbje'] = wbje_data'''
                # Persist to the database.
                kmString = json.dumps(km_data)
                self.exportSql.insert_new(ztID, Qj, infonameID, kmString)
        except Exception as e:
            msg = '科目余额导出失败:{}'.format(str(e))
        else:
            msg = '科目余额导出成功'
        return msg

    def getKMBody(self, Qj):
        """Fetch the account-balance-sheet body for period `Qj` (YYYYMM)."""
        js = '''getKMBody=function(kjqj_date){ var data = { "beginPeriod":kjqj_date, "endPeriod":kjqj_date, "beginTitleCode":"", "endTitleCode":"", "pageNo":0, "pageSize":5000, "showYearAccumulated":true, "assistantId":"", "assistantType":"", "showAssistant":true, "titleLevel":6, "showEndBalance0":true, "showQuantity":false, "fcurCode":"" } $.ajax({ type: "POST", url: 'https://17dz.com/xqy-portal-web/finance/accountBalanceSheet/query', contentType:'application/json;charset=utf-8', data: JSON.stringify(data), success: function (result) { if(result.success) { top.KMBody = result; } else { top.KMBody = result; } } }) } '''
        self.browser.evaluate_script(js)
        # Fetch the balances.
        self.browser.evaluate_script('getKMBody("%s")' % Qj)
        data_km = {}
        i = 1
        while True:
            if self.browser.evaluate_script("top.KMBody"):
                data_km = self.browser.evaluate_script('top.KMBody')['body']
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.KMBody=""')
        # NOTE(review): leftover debug print for one specific period.
        if Qj == "201601":
            print(data_km)
        return data_km

    # Auxiliary-accounting balance sheet (辅助核算余额表)
    def fzhssheet(self, QjList, company):
        js = '''getFzhssheet=function(kjqj_date){ var data = { "assistantType":"c", "beginCode":"", "endCode":"", "beginPeriod":kjqj_date, "endPeriod":kjqj_date, "assistantId":"", "bwsTypeList":"B", "level":"6", "showEmptyBalance":false, "firstAccountTitle":false, "accumulated":true } $.ajax({ type: "POST", url: 'https://17dz.com/xqy-portal-web/finance/assistantBalanceBook/list', contentType:'application/json;charset=utf-8', data: JSON.stringify(data), success: function (result) { if(result.success) { top.fzhs_data = result; } else { top.fzhs_data = result; } } }) }'''
        self.browser.evaluate_script(js)
        fzhs_dict = {}
        for Qj in QjList:
            self.browser.evaluate_script('getFzhssheet("%s")' % Qj)
            i = 1
            while True:
                if self.browser.evaluate_script("top.fzhs_data"):
                    fzhsye = self.browser.evaluate_script(
                        'top.fzhs_data')['body']
                    break
                elif i > 5:
                    break
                else:
                    time.sleep(0.5)
                    i += 1
                pass
            self.browser.evaluate_script('top.fzhs_data=""')
            # Quantity-amount and foreign-currency variants.
            slje = self.getFZBody(Qj, "s", "B,S")
            wbje = self.getFZBody(Qj, "w", "B,W")
            li = {}
            li['fzhsye'] = fzhsye
            li['slje'] = slje
            li['wbje'] = wbje
            fzhs_dict[str(Qj)] = li
        if not fzhs_dict:
            return '获取辅助核算余额表失败'
        # Persist to the database.
        try:
            self.exportSql.update_fzsheet(company, fzhs_dict)
        except Exception as e:
            return '辅助核算余额表保存失败:%s' % e
        return '辅助核算余额表导出成功'

    def getFZBody(self, Qj, balanceType, bwsTypeList):
        """Fetch one auxiliary-balance variant for period `Qj`.

        NOTE(review): the JS hard-codes periods "201811" and ignores the
        kjqj_date parameter -- looks like a latent bug; confirm intent.
        """
        js = '''getFZBody=function(kjqj_date,balanceType,bwsTypeList){ var data = {"beginPeriod":"201811", "endPeriod":"201811", "beginCode":"", "endCode":"", "assistantType":"c", "assistantId":"", "balanceType":balanceType, "ifCondition":false, "bwsTypeList":bwsTypeList, "firstAccountTitle":false, "showEmptyBalance":false, "level":"6",
"accumulated":true } $.ajax({ type: "POST", url: 'https://17dz.com/xqy-portal-web/finance/assistantBalanceBook/list', contentType:'application/json;charset=utf-8', data: JSON.stringify(data), success: function (result) { if(result.success) { top.FZBody = result; } else { top.FZBody = result; } } }) }'''
        self.browser.evaluate_script(js)
        # Fetch the balances.
        self.browser.evaluate_script('getFZBody("%s","%s","%s")' %
                                     (Qj, balanceType, bwsTypeList))
        i = 1
        km_data = {}
        while True:
            if self.browser.evaluate_script("top.FZBody"):
                km_data = self.browser.evaluate_script('top.FZBody')['body']
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.FZBody=""')
        return km_data

    # Cash flow (现金流量): exports monthly and quarterly reports.
    def xjll(self, QjList, ztID, infoname):
        jd_js = '''xjll=function(url){ $.ajax({ type:'GET', url: url, success: function (result) { if(result.success) { top.xjll_data = result; } else { top.xjll_data = result; } } }) }'''
        # self.browser.evaluate_script(xjll_js)
        # NOTE(review): jd_js is never injected (the evaluate call above is
        # commented out); this method fetches JSON by visiting URLs instead.
        # Work in a fresh tab so the main session page is left untouched.
        self.browser.evaluate_script('window.open("about:blank")')
        self.browser.windows.current = self.browser.windows[1]
        # Starting quarter.
        jd_url = 'https://17dz.com/xqy-portal-web/finance/cashFlowInitial/queryInitialPeriod?_=1547174631618'
        self.browser.visit(jd_url)
        jd_jsonStr = self.browser.find_by_tag('pre').first.text
        # NOTE(review): init_jd is computed but never used below.
        init_jd = json.loads(jd_jsonStr)
        # Monthly reports (月报)
        Y_infonameID = self.exportSql.init_infoname(infoname + '-月报').id
        try:
            for Qj in QjList:
                time.sleep(0.5)
                y_url = 'https://17dz.com/xqy-portal-web/finance/cashFlowSheet?accountPeriod={}&sheetType=2&_=1543301545878'.format(
                    Qj)
                self.browser.visit(y_url)
                Y_jsonStr = self.browser.find_by_tag('pre').first.text
                # Save the monthly report.
                self.exportSql.insert_new(ztID, Qj, Y_infonameID, Y_jsonStr)
        except Exception as e:
            msg = '现金流量月报导出失败:{}'.format(str(e))
        else:
            msg = '现金流量月报导出成功'
        # Quarterly reports (季报)
        J_infonameID = self.exportSql.init_infoname(infoname + '-季报').id
        try:
            for Qj in QjList:
                year = Qj[:4]
                # Map month -> quarter number.
                if Qj[4:] in ['01', '02', '03']:
                    jd = '1'
                elif Qj[4:] in ['04', '05', '06']:
                    jd = '2'
                elif Qj[4:] in ['07', '08', '09']:
                    jd = '3'
                elif Qj[4:] in ['10', '11', '12']:
                    jd = '4'
                time.sleep(0.5)
                j_url = 'https://17dz.com/xqy-portal-web/finance/cashFlowSheet/quarterlyReport?year={}&season={}&_=1543301545880'.format(
                    year, jd)
                self.browser.visit(j_url)
                J_jsonStr = self.browser.find_by_tag('pre').first.text
                qj = '%s-%s' % (year, jd)
                # Save the quarterly report.
                self.exportSql.insert_new(ztID, qj, J_infonameID, J_jsonStr)
        except Exception as e:
            msg = '现金流量季报导出失败:{}'.format(str(e))
        else:
            msg = '现金流量季报导出成功'
        # Close the scratch tab and return to the main window.
        self.browser.windows.current.close()
        self.browser.windows.current = self.browser.windows[0]
        return msg

    # Basic settings (基础设置): account titles, auxiliary items, currencies.
    def settings(self, customerId, ztID, accountSetId, QjList, infoname):
        set_js = '''getSettings=function(url){ $.ajax({ type:'GET', url: url, success: function (result) { if(result.success) { top.load_data = result; } else { top.load_data = result; } } }) }'''
        self.browser.evaluate_script(set_js)
        # Account titles (科目)
        kmID = self.exportSql.init_infoname(infoname + '-科目').id
        km_dict = {}
        # Codes for asset / liability / equity / cost / P&L categories.
        code_url = 'https://17dz.com/xqy-portal-web/finance/accountTitle/types?systemAccountId=1&_=1542955356592'
        AllCodes = self.get_settings(code_url).get('body', [])
        for i in AllCodes:
            code = i['code']
            name = i['name']
            km_url = 'https://17dz.com/xqy-portal-web/finance/customerAccountTitles/' \
                'listByType?customerId={}&subjectType={}&_=1542955356593'.format(customerId,code)
            res = self.get_settings(km_url)
            km_dict[name] = res
        # Auxiliary accounting (辅助核算)
        fzID = self.exportSql.init_infoname(infoname + '-辅助核算').id
        fz_dict = {}
        Base_url = 'https://17dz.com/xqy-portal-web/finance/{}/list' \
            '/page?key=&accountSetId={}&customerId={}&pageNo=0&pageSize=10000'
        FZ_List = ['clients', 'suppliers', 'inventories', 'projects']
        for name in FZ_List:
            newUrl = Base_url.format(name, accountSetId, customerId)
            data = self.get_settings(newUrl).get('body')
            fz_dict[name] = data
        # Currencies (币别)
        bbID = self.exportSql.init_infoname(infoname + '-币别').id
        url = 'https://17dz.com/xqy-portal-web/finance/exchangeRates/all?accountPeriod={}&_=1542955356686'
        for Qj in QjList:
            newurl = url.format(Qj)
            B = self.get_settings(newurl)
            # Save currencies per period.
            self.exportSql.insert_new(ztID, Qj, bbID, json.dumps(B))
        try:
            # Save account titles.
            self.exportSql.insert_new(ztID, '', kmID, json.dumps(km_dict))
            # Save auxiliary-accounting items.
            self.exportSql.insert_new(ztID, '', fzID, json.dumps(fz_dict))
        except Exception as e:
            return '基础设置导出成功保存失败:{}'.format(str(e))
        return '基础设置导出成功'

    def get_settings(self, url):
        """Invoke the injected getSettings() helper and poll for its result."""
        try:
            self.browser.evaluate_script('getSettings("%s")' % url)
        except Exception as e:
            print(e)
        i = 1
        settings_data = {}
        while True:
            if self.browser.evaluate_script("top.load_data"):
                settings_data = self.browser.evaluate_script('top.load_data')
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.load_data=""')
        return settings_data

    # Fetch cash flow via the injected xjll() helper.
    def get_xjll(self, url):
        try:
            self.browser.evaluate_script('xjll("%s")' % url)
            time.sleep(5.5)
        except Exception as e:
            print(e)
        i = 1
        xjll_data = {}
        while True:
            if self.browser.evaluate_script("top.xjll_data"):
                xjll_data = self.browser.evaluate_script('top.xjll_data')
                break
            elif i > 5:
                break
            else:
                time.sleep(0.5)
                i += 1
            pass
        self.browser.evaluate_script('top.xjll_data=""')
        return xjll_data
and add your preferred driver, Browser('webdriver.chrome') for example,
but by default it is always set as webdriver.firefox

NOTE: choose one url that contains broken links to see the response

NOTE: this is basic code, you can improve it and do what you want,
believe, you could do almost everything :)

For more information, see the docs: http://splinter.cobrateam.info/docs/
"""
from splinter.browser import Browser
from splinter.request_handler.status_code import HttpResponseError

# Python 2 script: checks every link on a page and reports broken ones.
browser = Browser()

# Visit URL
url = "http://splinter.cobrateam.info/"
browser.visit(url)

# Get all links in this page
urls = [a["href"] for a in browser.find_by_tag("a")]

# Visit each one link and verify if is ok
for url in urls:
    try:
        browser.visit(url)
        if browser.status_code.is_success():
            print "(", browser.status_code.code, ") visit to", url, "was a success!"
    except HttpResponseError, e:
        # Non-2xx responses raise; report the status and reason.
        print "(", e.status_code, ") visit to", url, "was fail! Error:", e.reason

browser.quit()
class TestEngine(object):
    """Replays scripted UI test actions against a splinter-driven browser.

    Actions run on background threads; each action descriptor names a URL
    path, the forms to fill and the events (click, hover, select, ...) to
    fire, plus an optional expected text to verify on the resulting page.
    """

    # Class-wide tuning knobs, overridable through set_config().
    __sleep_time = 2        # pause (s) after filling a form
    __mouse_over = True     # hover over elements before clicking them
    __mouse_over_sleep = 1  # pause (s) between hover and click

    def __init__(self, browser_name, execute_path=None):
        """Open a fullscreen browser.

        browser_name -- splinter driver name (e.g. "chrome")
        execute_path -- optional path to the driver executable
        """
        if execute_path is None:
            self.__browser = Browser(browser_name, fullscreen=True)
        else:
            self.__browser = Browser(browser_name, executable_path=execute_path, fullscreen=True)
        self.__quit = False

    @staticmethod
    def set_config(config):
        """Override the class-level knobs from a dict; missing keys keep their defaults."""
        TestEngine.__sleep_time = 2 if config.get("sleep_time") is None else config.get("sleep_time")
        TestEngine.__mouse_over = True if config.get("mouse_over") is None else config.get("mouse_over")
        TestEngine.__mouse_over_sleep = 1 if config.get("mouse_over_sleep") is None else config.get("mouse_over_sleep")

    def test_list_acts(self, domain, action_list, back_fun=None, result_back=None):
        """Run a list of action descriptors asynchronously on a worker thread."""
        thread_deal = threading.Thread(target=self.__test_list_thread,
                                       args=(domain, action_list, back_fun, result_back),
                                       name="TestEngine deal tester")
        thread_deal.start()

    def test_deal(self, domain, action_obj, back_fun=None, result_back=None):
        """Run a single action descriptor asynchronously on a worker thread."""
        thread_deal = threading.Thread(target=self.__test_do_thread,
                                       args=(domain, action_obj, back_fun, result_back),
                                       name="TestEngine deal tester")
        thread_deal.start()

    def quit(self):
        """Close the browser and mark this engine as finished."""
        self.__quit = True
        self.__browser.quit()

    def is_quited(self):
        """Return True once quit() has been called."""
        return self.__quit

    def __test_list_thread(self, domain, action_list, back_fun=None, result_back=None):
        """Worker: run every action, then wait and close (or hand off to back_fun)."""
        try:
            for action in action_list:
                self.__test_do(domain, action, result_back)
        except Exception as e:
            raise Exception("[Error code] deal test list failed, error code=", e)
        finally:
            if action_list[0].waitClose != 0:
                sleep(action_list[0].waitClose)
            if back_fun is None:
                self.quit()
            else:
                back_fun()

    def __test_do_thread(self, domain, action_obj, back_fun=None, result_back=None):
        """Worker: run one action, then wait and close (or hand off to back_fun)."""
        try:
            self.__test_do(domain, action_obj, result_back)
        except Exception as e:
            raise Exception("[Error code] deal test failed, error code=", e)
        finally:
            if action_obj.waitClose != 0:
                sleep(action_obj.waitClose)
            if back_fun is None:
                self.quit()
            else:
                back_fun()

    def __test_do(self, domain, action_obj, result_back=None):
        """Visit the action's URL, fill its forms, then fire its event list.

        When forms are present, the first entry of actionList is treated as
        the form-submit action and replayed after each form is filled; the
        remaining actions run once afterwards, in order.
        """
        test_url = domain + action_obj.urlPath
        self.__browser.visit(test_url)
        action_list = TesterActionData().dict_to_list(action_obj.actionList)
        if action_obj.forms is not None:
            form_action = action_list[0] if action_list else None
            forms = TesterForms().dict_to_list(action_obj.forms)
            for form in forms:
                params = TesterFormData().dict_to_list(form.params)
                for param in params:
                    # NOTE(review): .decode("utf-8") implies formElValue is a
                    # byte string (Python 2 heritage) -- confirm under Python 3.
                    self.__set_value(int(param.formType), param.formElName,
                                     param.formElValue.decode("utf-8"), int(param.index))
                sleep(TestEngine.__sleep_time)
                if form_action is not None:
                    self.__deal_action(form_action, result_back)
                    sleep(action_obj.sleepTime)
            for action_deal in action_list[1:]:
                self.__deal_action(action_deal, result_back)
                sleep(action_obj.sleepTime)
        else:
            for action_deal in action_list:
                self.__deal_action(action_deal, result_back)
                sleep(action_obj.sleepTime)

    def __set_value(self, form_type, el_name, el_value, index):
        """Fill one form control, dispatching on the control's input type."""
        elements = self.__event_element(form_type, el_name)
        element = elements[index]
        if element['type'] in ['text', 'password', 'tel'] or element.tag_name == 'textarea':
            element.value = el_value
        elif element['type'] == 'checkbox':
            if el_value:
                element.check()
            else:
                element.uncheck()
        elif element['type'] == 'radio':
            element.click()
        elif element._element.tag_name == 'select':
            element.find_by_value(el_value).first._element.click()
        else:
            # Fallback: treat anything unrecognized as a plain value field.
            element.value = el_value

    def __event_element(self, el_type, el_value):
        """Look up elements via the locator strategy encoded in el_type."""
        ele_type = EL_TYPE.value(el_type)
        if ele_type == "id":
            return self.__browser.find_by_id(el_value)
        elif ele_type == "name":
            return self.__browser.find_by_name(el_value)
        elif ele_type == "tag":
            return self.__browser.find_by_tag(el_value)
        elif ele_type == "value":
            return self.__browser.find_by_value(el_value)
        elif ele_type == "selector":
            return self.__browser.find_by_xpath(el_value)
        elif ele_type == "css":
            return self.__browser.find_by_css(el_value)
        else:
            # BUG FIX: the message previously carried unformatted %s
            # placeholders (ValueError received a tuple of arguments).
            raise ValueError("Test Engine can't deal the element type:%s, el_type:%s"
                             % (ele_type, el_type))

    def __deal_action(self, action_data, result_back=None):
        """Fire one event on its target element and report the text-presence check."""
        action_type = ACTION_TYPE.value(action_data.action)
        # After a navigation, operate on the most recently opened window.
        self.__browser.windows.current = self.__browser.windows[-1]
        if action_type not in ("click", "double click", "right click",
                               "mouse over", "mouse out", "select"):
            # BUG FIX: message previously carried an unformatted %s placeholder.
            raise Exception("don't find action for action:%s" % action_data.action)
        target = self.__event_element(action_data.elType, action_data.elValue)[int(action_data.index)]
        if action_type == "click":
            self.__mouse_of_click(target)
        elif action_type == "double click":
            self.__mouse_of_double_click(target)
        elif action_type == "right click":
            self.__mouse_of_right_click(target)
        elif action_type == "mouse over":
            target.mouse_over()
        elif action_type == "mouse out":
            target.mouse_out()
        elif action_type == "select":
            target.select()
        try:
            if action_data.testerResult is not None and result_back is not None:
                sleep(3)
                result_back(TesterResult(action_data.testerResult,
                                         self.__browser.is_text_present(action_data.testerResult)))
        except Exception:
            # BUG FIX: guard against result_back being None in the error path
            # (previously raised TypeError inside the except clause).
            if result_back is not None:
                result_back(TesterResult(action_data.testerResult, False))

    def __mouse_of_click(self, event_deal_obj):
        """Click, hovering first when __mouse_over is enabled."""
        if TestEngine.__mouse_over:
            event_deal_obj.mouse_over()
            sleep(TestEngine.__mouse_over_sleep)
        event_deal_obj.click()

    def __mouse_of_right_click(self, event_deal_obj):
        """Right-click, hovering first when __mouse_over is enabled."""
        if TestEngine.__mouse_over:
            event_deal_obj.mouse_over()
            sleep(TestEngine.__mouse_over_sleep)
        # BUG FIX: the no-hover branch previously issued a plain click()
        # instead of the requested right-click.
        event_deal_obj.right_click()

    def __mouse_of_double_click(self, event_deal_obj):
        """Double-click, hovering first when __mouse_over is enabled."""
        if TestEngine.__mouse_over:
            event_deal_obj.mouse_over()
            sleep(TestEngine.__mouse_over_sleep)
        # BUG FIX: the no-hover branch previously issued a plain click()
        # instead of the requested double-click.
        event_deal_obj.double_click()
class LemonLemon_douyin(object):
    """Batch downloader for Douyin videos listed (one share URL per line) in a text file."""

    def __init__(self, width=500, height=300):
        """Start the headless Chrome used to resolve watermark-free URLs."""
        chrome_options = Options()
        chrome_options.add_argument(
            'user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"'
        )
        self.driver = Browser(driver_name="chrome", options=chrome_options, headless=True)

    def get_video_urls(self, input_f):
        """Read share URLs from input_f; only lines starting with "http" count.

        Parameters:
            input_f: path to a text file, one URL per line
        Returns:
            (video_names, video_urls): parallel lists; names are
            "<timestamp>_<n>.mp4".
        """
        video_names = []
        video_urls = []
        i = 1
        now_date = datetime.datetime.now()
        # NOTE(review): the ':' in this stamp is invalid in Windows file
        # names -- kept for backward compatibility with existing output.
        self.date_today = (str(now_date.date()) + "_" + str(now_date.hour) +
                           ":" + str(now_date.minute))
        with open(input_f) as f:
            for line in f:
                # BUG FIX: strip the trailing newline that was previously
                # kept in the URL and broke the later requests.get() call.
                line = line.strip()
                if line.startswith("http"):
                    video_urls.append(line)
                    video_names.append(self.date_today + "_" + str(i) + ".mp4")
                    i += 1
        return video_names, video_urls

    def get_download_url(self, video_url):
        """Scrape the watermarked download URL out of the share page's JS data blob.

        Parameters:
            video_url: share-page URL
        Returns:
            direct (watermarked) video download URL
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, "lxml")
        script = bf.find_all("script")[-1]
        video_url_js = re.findall(r"var data = \[(.+)\];", str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html["video"]["play_addr"]["url_list"][0]
        return download_url

    def video_downloader(self, video_url, video_name, watermark_flag=True):
        """Stream one video to disk, printing size and progress.

        Parameters:
            video_url: share-page URL
            video_name: output file path
            watermark_flag: True -> resolve a watermark-free URL first
        """
        size = 0
        if watermark_flag:
            video_url = self.remove_watermark(video_url)
        else:
            video_url = self.get_download_url(video_url)
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers["content-length"])
            if response.status_code == 200:
                sys.stdout.write(" [文件大小]:%0.2f MB\n" % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(" [下载进度]:%.2f%%" % float(size / content_size * 100) + "\r")
                        sys.stdout.flush()

    def remove_watermark(self, video_url):
        """Resolve a watermark-free download URL via douyin.iiilab.com.

        Parameters:
            video_url: share-page URL
        Returns:
            watermark-free video download URL
        """
        self.driver.visit("http://douyin.iiilab.com/")
        self.driver.find_by_tag("input").fill(video_url)
        self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
        html = self.driver.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
        bf = BeautifulSoup(html, "lxml")
        return bf.find("a").get("href")

    def run(self, input_f):
        """Download every video listed in input_f into ./douyin_download<date>/."""
        error_url = []
        video_names, video_urls = self.get_video_urls(input_f)
        out_dir = "douyin_download" + self.date_today
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        print("视频下载中:共有%d个作品!\n" % len(video_urls))
        for num in range(len(video_urls)):
            print(" 解析第%d个视频链接 [%s] 中,请稍后!\n" % (num + 1, video_urls[num]))
            # Random delay to avoid hammering the resolver service.
            random_wait = random.uniform(3, 5)
            print("waiting...", random_wait)
            time.sleep(random_wait)
            # Strip path separators that would break the output file name
            # (the original only removed one kind, whichever appeared first).
            video_name = video_names[num].replace("\\", "").replace("/", "")
            try:
                self.video_downloader(
                    video_urls[num],
                    os.path.join(out_dir, video_name),
                )
            except Exception:
                print("**************************")
                print("ERROR", video_urls[num])
                error_url.append(video_urls[num])
            print("\n")
        self.driver.close()
        # BUG FIX: the original rebound ``f`` to the list instead of writing,
        # leaving error_url.txt empty.
        with open("error_url.txt", "w") as f:
            f.write("\n".join(error_url))
        print("下载完成!")
        print("出错数量:", len(error_url))
""" Simply visit a URL to get some information, just for improve in future, adding various ways to get data from page with splinter API """ from splinter.browser import Browser browser = Browser() # Visit URL url = "http://splinter.cobrateam.info/" browser.visit(url) # by property print 'URL:', browser.url print 'Page Title:', browser.title # method print 'H1:', browser.find_by_tag('h1').first.value print 'Total Links:', len(browser.find_by_tag('a')) browser.quit()
class LemonLemon_douyin(object):
    """Batch downloader for Douyin videos; input lines are Python dict
    literals with keys "ID", "des_md5" and "link"."""

    def __init__(self, width=500, height=300):
        """Start the headless Chrome used to resolve watermark-free URLs."""
        chrome_options = Options()
        chrome_options.add_argument(
            'user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"'
        )
        self.driver = Browser(driver_name="chrome", options=chrome_options, headless=True)
        self.tool = Tools()

    def get_video_urls(self, input_f):
        """Parse one dict literal per line of input_f into name/url lists.

        Parameters:
            input_f: path to a text file; each line is a dict literal like
                     {'ID': ..., 'des_md5': ..., 'link': ...}
        Returns:
            (video_names, video_urls): parallel lists; names are
            "<ID>_<des_md5>.mp4".
        """
        import ast  # local import: keeps the security fix self-contained
        video_names = []
        video_urls = []
        i = 1
        now_date = datetime.datetime.now()
        self.date_today = (str(now_date.date()) + "_" + str(now_date.hour) +
                           ":" + str(now_date.minute))
        with open(input_f) as f:
            for line in f:
                # SECURITY FIX: eval() executed arbitrary code from the input
                # file; literal_eval only accepts Python literals.
                info = ast.literal_eval(line)
                ID = info["ID"]
                des_md5 = info["des_md5"]
                url = info["link"]
                video_urls.append(url)
                video_names.append(ID + "_" + des_md5 + ".mp4")
                i += 1
        return video_names, video_urls

    def get_download_url(self, video_url):
        """Scrape the watermarked download URL out of the share page's JS data blob.

        Parameters:
            video_url: share-page URL
        Returns:
            direct (watermarked) video download URL
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, "lxml")
        script = bf.find_all("script")[-1]
        video_url_js = re.findall(r"var data = \[(.+)\];", str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html["video"]["play_addr"]["url_list"][0]
        return download_url

    def video_downloader(self, video_url, video_name, watermark_flag=True):
        """Stream one video to disk, printing size and progress.

        Parameters:
            video_url: share-page URL
            video_name: output file path
            watermark_flag: True -> resolve a watermark-free URL first
        """
        size = 0
        if watermark_flag:
            video_url = self.remove_watermark(video_url)
        else:
            video_url = self.get_download_url(video_url)
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers["content-length"])
            if response.status_code == 200:
                sys.stdout.write(" [文件大小]:%0.2f MB\n" % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(" [下载进度]:%.2f%%" % float(size / content_size * 100) + "\r")
                        sys.stdout.flush()

    def remove_watermark(self, video_url):
        """Resolve a watermark-free download URL via douyin.iiilab.com.

        Parameters:
            video_url: share-page URL
        Returns:
            watermark-free video download URL
        """
        self.driver.visit("http://douyin.iiilab.com/")
        self.driver.find_by_tag("input").fill(video_url)
        self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
        html = self.driver.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
        bf = BeautifulSoup(html, "lxml")
        return bf.find("a").get("href")

    def run(self, input_f):
        """Download every listed video into ../DOWNLOAD/<ID>/, skipping files
        that already exist, and record failed URLs in error_url.txt."""
        error_url = []
        video_names, video_urls = self.get_video_urls(input_f)
        # base_dir (renamed from ``file``, which shadowed the builtin):
        # ../DOWNLOAD relative to the current working directory.
        base_dir = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), "DOWNLOAD")
        if not os.path.exists(base_dir):
            os.mkdir(base_dir)
        print("视频下载中:共有%d个作品!\n" % len(video_urls))
        for num in range(len(video_urls)):
            print(" 解析第%d个视频链接 [%s] 中,请稍后!\n" % (num + 1, video_urls[num]))
            # Random delay to avoid hammering the resolver service.
            random_wait = random.uniform(3, 5)
            print("waiting...", random_wait)
            time.sleep(random_wait)
            video_name = video_names[num]
            ID = video_name.split("_")[0]
            # Ensure the per-ID output directory exists.
            id_dir = os.path.join(base_dir, ID)
            if not os.path.exists(id_dir):
                os.mkdir(id_dir)
            # Skip files that were already downloaded.
            video_file = os.path.join(id_dir, video_name)
            if not os.path.exists(video_file):
                try:
                    self.video_downloader(video_urls[num], video_file)
                    self.tool.writeToFile(video_name, "SuccessDownload")
                except Exception:
                    print("**************************")
                    print("ERROR", video_urls[num])
                    error_url.append(video_urls[num])
                print("\n")
            else:
                print(video_name + "文件已存在")
        # BUG FIX: the original rebound ``f`` to the list instead of writing,
        # leaving error_url.txt empty.
        with open("error_url.txt", "w") as f:
            f.write("\n".join(error_url))
        print("下载完成!")
        print("出错数量:", len(error_url))
class DouYin(object):
    """Douyin App video downloader: resolves a user's video list via the
    mobile API, then downloads each video (optionally watermark-free)."""

    def __init__(self, width = 500, height = 300):
        """Start the headless Chrome used to resolve watermark-free URLs."""
        # headless browser
        chrome_options = Options()
        chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"')
        self.driver = Browser(driver_name='chrome', executable_path='D:/chromedriver', options=chrome_options, headless=True)

    def get_video_urls(self, user_id):
        """Fetch the share URLs of every video posted by user_id.

        Parameters:
            user_id: the user ID to query
        Returns:
            video_names: list of video file names
            video_urls: list of share-page URLs
            nickname: the user's nickname
        """
        video_names = []
        video_urls = []
        unique_id = ''
        # NOTE(review): this loops forever if the first search result never
        # matches user_id -- confirm the API guarantees an eventual match.
        while unique_id != user_id:
            search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
            req = requests.get(url = search_url, verify = False)
            html = json.loads(req.text)
            aweme_count = html['user_list'][0]['user_info']['aweme_count']
            uid = html['user_list'][0]['user_info']['uid']
            nickname = html['user_list'][0]['user_info']['nickname']
            unique_id = html['user_list'][0]['user_info']['unique_id']
        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
        req = requests.get(url = user_url, verify = False)
        html = json.loads(req.text)
        i = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            # Videos with the generic share description get a numeric name.
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(i) + '.mp4')
                i += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])
        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """Scrape the watermarked download URL out of the share page's JS data blob.

        Parameters:
            video_url: share-page URL
        Returns:
            direct (watermarked) video download URL
        """
        req = requests.get(url = video_url, verify = False)
        bf = BeautifulSoup(req.text, 'lxml')
        script = bf.find_all('script')[-1]
        video_url_js = re.findall('var data = \[(.+)\];', str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name, watermark_flag=True):
        """Stream one video to disk, printing size and progress.

        Parameters:
            video_url: share-page URL
            video_name: output file name
            watermark_flag: True -> resolve a watermark-free URL first
        """
        size = 0
        if watermark_flag == True:
            video_url = self.remove_watermark(video_url)
        else:
            video_url = self.get_download_url(video_url)
        with closing(requests.get(video_url, stream=True, verify = False)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size = chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                        sys.stdout.flush()

    def remove_watermark(self, video_url):
        """Resolve a watermark-free download URL via douyin.iiilab.com.

        Parameters:
            video_url: share-page URL
        Returns:
            watermark-free video download URL
        """
        self.driver.visit('http://douyin.iiilab.com/')
        self.driver.find_by_tag('input').fill(video_url)
        self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
        html = self.driver.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
        bf = BeautifulSoup(html, 'lxml')
        return bf.find('a').get('href')

    def run(self):
        """Interactive entry point: ask for a user ID and download all videos
        into a directory named after the user's nickname."""
        self.hello()
        user_id = input('请输入ID(例如40103580):')
        video_names, video_urls, nickname = self.get_video_urls(user_id)
        if nickname not in os.listdir():
            os.mkdir(nickname)
        print('视频下载中:共有%d个作品!\n' % len(video_urls))
        for num in range(len(video_urls)):
            print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num+1, video_urls[num]))
            # Strip path separators that would break the output file name.
            # NOTE(review): only one separator kind is removed per name,
            # whichever matches first.
            if '\\' in video_names[num]:
                video_name = video_names[num].replace('\\', '')
            elif '/' in video_names[num]:
                video_name = video_names[num].replace('/', '')
            else:
                video_name = video_names[num]
            self.video_downloader(video_urls[num], os.path.join(nickname, video_name))
            print('\n')
        print('下载完成!')

    def hello(self):
        """Print the welcome banner."""
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('\t\t作者:Jack Cui')
        print('*' * 100)
# NOTE(review): this chunk starts mid-script -- ``br``, ``cj`` and ``fd`` are
# defined before this excerpt.  Python 2 code (print statements, file(),
# csv reader opened in 'rb').
br.set_cookiejar(cj)
browser = Browser()

#As of March 27, 2016
inp = csv.reader(file(fd+'Complete_list.csv','rb'))
head = inp.next()
# Print the header columns with their indices for reference.
for e,i in enumerate(head):
    print e,i
fd2 = 'g:/health_data/provider_urls/'
for i in inp:
    # Skip rows whose URL column is "None" or "NOT SUBMITTED".
    if not re.search("^None|NOT SUBMITTED",i[2]):
        print i[1]
        try:
            # One tab-delimited output file per provider (named by column 1).
            outp = csv.writer(open(os.path.join(fd2,i[1]+'.csv'),'wb'),delimiter='\t')
            browser.visit(i[2])
            sleep(1)
            try:
                # Preferred: the JSON payload rendered inside a <pre> tag.
                need = browser.find_by_css('pre')
                proc = json.loads(need[0].text)
            except:
                # Fallback: take the body text and trim anything after the
                # closing brace before parsing.
                need = browser.find_by_tag('body')
                proc = json.loads(re.sub('}.*?$','}',need[0].text))
            for p in proc['provider_urls']:
                outp.writerow([p])
            #call('taskkill /F /IM firefox.exe')
        except:
            traceback.print_exc()