def annotateResponse(self, response):
    info = response.info()
    try:
        mimeType, sep, mimeInfo = info["Content-Type"].partition(";")
        m = self.charsetRE.search(mimeInfo)
        if m is not None:
            encoding = m.group(1)
        else:
            encoding = None
        mimeType = mimeType.strip()
    except (AttributeError, KeyError):
        mimeType = "unknown/unknown"
        encoding = None
    try:
        response.handler = self.mimeMap[mimeType]
    except KeyError:
        # No exact MIME-type match; fall back to the glob patterns.
        for glob, handler in self.globMap:
            if fnmatch(mimeType, glob):
                response.handler = handler
                break
        else:
            raise URLLookupError("No handler for MIME type: {0}".format(mimeType))
    response.mimeType = mimeType
    response.encoding = encoding
    self.bufferResponse(response, info)
    response.url = urllib.parse.urlparse(response.geturl())
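# A minimal, hypothetical sketch of the attributes annotateResponse() relies on
# (charsetRE, mimeMap, globMap).  The handler class and names here are
# placeholders for illustration; only the shapes of the lookups are meant.
import re
from fnmatch import fnmatch


class EchoHandler:
    def processResponse(self, response, no_description=False):
        yield "title: (not parsed in this sketch)"


charsetRE = re.compile(r'charset\s*=\s*"?([\w.-]+)"?', re.IGNORECASE)
mimeMap = {"text/html": EchoHandler()}    # exact MIME type -> handler
globMap = [("image/*", EchoHandler())]    # glob-pattern fallbacks, tried in order

# e.g. resolving a handler for a raw Content-Type header value:
mimeType = "text/html; charset=UTF-8".partition(";")[0].strip()
handler = mimeMap.get(mimeType)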
def login(self):
    """Log in to the web portal."""
    # Assumes module-level imports: http.cookiejar, urllib.request,
    # urllib.parse.urlencode, urllib.error.URLError / HTTPError.
    # Set up a cookie jar so the session is kept across requests.
    cookie_jar = http.cookiejar.LWPCookieJar()
    cookie_support = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    # Fetch the validation (CAPTCHA) code.
    code = False
    #while not code:
    code = self.validate_code()
    print("Validate Code: %s" % code)
    # POST the login form.
    login_url = "http://%s/UserAction.do" % self.host
    data = urlencode({"method": "login",
                      "userid": self.uid,
                      "passWord": self.password,
                      "validateCode": code})
    request = urllib.request.Request(login_url, data.encode())
    try:
        response = urllib.request.urlopen(request)
    except (URLError, HTTPError):
        self.write_log("Error: Access Login URL fail.", "%s")
        return
    else:
        if response.geturl().find("welcome.jsp") > 0:
            self.write_log(("Login feedback OK.", self.uid), "%s (%s)")
            return True
        else:
            self.write_log(("Error: Login fail.", self.uid), "%s (%s)")
            return False
def _process_uncached_url(self, url, no_description=False):
    request = urllib.request.Request(url, headers={
        "User-Agent": self.userAgent,
        "Accept": self.acceptHeader
    })
    try:
        startTime = time.time()
        response = urllib.request.urlopen(request, timeout=self.timeout)
        timeTaken = time.time() - startTime
    except socket.timeout:
        raise URLLookupError("Timed out")
    except urllib.error.URLError as err:
        raise URLLookupError(err.reason)
    except Exception as err:
        raise URLLookupError(type(err).__name__)
    try:
        newURL = response.geturl()
        if newURL != url and self.showRedirects:
            yield "→ <{0}>".format(newURL)
        self.annotateResponse(response)
        if response.contentLength is not None:
            sizeFormatted = formatBytes(response.contentLength)
        else:
            sizeFormatted = "unknown size"
        responseIter = iter(response.handler.processResponse(response, no_description=no_description))
        firstLine = next(responseIter)
        for line in self.responseFormats:
            yield line.format(time=timeTaken, size=sizeFormatted, plugin=firstLine)
        for line in responseIter:
            yield line
    finally:
        response.close()
        del response.buf
        del response
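# Hypothetical formatBytes() helper matching how it is used above; the real
# implementation in the surrounding project may differ.
def formatBytes(count):
    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
        if count < 1024 or unit == "TiB":
            return "{0:.1f} {1}".format(count, unit)
        count /= 1024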
def login(browser):
    browser.open('https://mbasic.facebook.com/login.php')
    browser.select_form(nr=0)
    browser.form['email'] = emailw
    browser.form['pass'] = passw
    response = browser.submit()
    if "checkpoint" in response.geturl():
        print("[Checkpoint] Account hit a 2FA checkpoint")
        if token2fa:
            # Fetch the current TOTP code from 2fa.live and submit it.
            browser.select_form(nr=0)
            browser.form['approvals_code'] = requests.get("https://2fa.live/tok/" + token2fa).json()['token']
            browser.submit()
            # Decline "remember this browser".
            browser.select_form(nr=0)
            browser.form.find_control(name="name_action_selected").value = ['dont_save']
            response = browser.submit()
            if "checkpoint" in response.geturl():
                browser.select_form(nr=0)
                response = browser.submit()
                browser.select_form(nr=0)
                response = browser.submit(name='submit[This was me]')
                browser.select_form(nr=0)
                browser.form.find_control(name="name_action_selected").value = ['dont_save']
                response = browser.submit()
    return response
import requests
from urllib import request
from bs4 import BeautifulSoup

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', data=payload)
print("#No.0001:")
print(ret.text)

url = 'https://www.baidu.com/'
req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
con1 = response.readlines()   # body already consumed by read(), so this is empty
con2 = response.info()
con3 = response.getcode()
con4 = response.geturl()
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")
print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))
namelist = content2.find_all(name='img')
print("#No.4==>")
for name in namelist:
    print(name)
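# Side note on the requests call above: a positional second argument is sent as
# form-encoded data.  To post a JSON body instead, use the json= keyword
# (illustration against the same httpbin endpoint).
import requests

r_form = requests.post("http://httpbin.org/post", data={"key1": "value1"})
r_json = requests.post("http://httpbin.org/post", json={"key1": "value1"})
print(r_form.json()["form"])   # {'key1': 'value1'}
print(r_json.json()["json"])   # {'key1': 'value1'}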
def get_content(channel_id, config=None):
    channel = Channel.get(channel_id)
    logger = get_logger('rss', channel)
    if not config:
        def get_param(x):
            return channel.get_config_param(x)
    else:
        def get_param(x):
            return config[x]
    url = get_param('url')
    parser_rules = get_param('parser_rules')          # rules of the form slide_element;entry_item;regexp
    additional_rules = get_param('additional_rules')  # rules of the form entry_item;string
    filter = get_param('filter')
    exception_rules = get_param('exception_rules')    # rules of the form entry_item;string
    no_slides = get_param('no_slides')
    time_limit = get_param('time_limit')
    duration = get_param('duration') * 1000
    template = get_param('template')
    theme = get_param('theme')
    min_age = datetime.now() - timedelta(days=time_limit)
    entries = feedparser_parse(url)['entries'][:no_slides] \
        if not config or not config.get('feed') else config.get('feed')
    capsules = []
    last_entries = []
    with open(cache_path, 'r+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        cache = json.load(f, cls=DateTimeDecoder)
        for entry in entries:
            if 'published_parsed' in entry:
                entry_age = datetime.fromtimestamp(mktime(entry['published_parsed']))
                if entry_age >= min_age:
                    last_entries.append(entry)
            else:
                entry_hash = hash_dict(entry)
                if entry_hash in cache:
                    if cache[entry_hash] >= min_age:
                        last_entries.append(entry)
                else:
                    cache[entry_hash] = datetime.now()
        f.seek(0)
        json.dump(cache, f, cls=DateTimeEncoder)
        f.truncate()
        fcntl.flock(f, fcntl.LOCK_UN)
    for entry in last_entries:
        slide_content = {}
        link_page = None
        for slide_element, entry_item, regexp in [rule.split(';') for rule in parser_rules]:
            field, input_type = slide_element.split(':')
            if field not in slide_content:
                slide_content[field] = {}
            if entry_item == 'link_page':
                if not link_page:
                    with urllib.request.urlopen(entry.link) as response:
                        link_page = response.read().decode(errors='ignore')
                        # in case of redirect(s), geturl() returns the final url of the page
                        entry_item += "@" + response.geturl()
                item = link_page
            else:
                item = deep_get(entry, *entry_item.split('.'))
            value = get_value(item, regexp)
            if input_type == 'src' and not _is_url(value):
                ref_url = entry.link
                if entry_item.startswith('link_page@'):
                    ref_url = entry_item.split('@')[1]
                value = urljoin(ref_url, value)
            slide_content[field].update({input_type: value})
        for slide_element, string in [rule.split(';') for rule in additional_rules]:
            field, input_type = slide_element.split(':')
            if field not in slide_content:
                slide_content[field] = {}
            if string.lower() == 'qrcode':
                input_type = 'qrcode'
                string = entry.link
            slide_content[field].update({input_type: string})
        if len(exception_rules) == 1 and not exception_rules[0].strip():
            capsules.append(RssCapsule(theme=theme, slides=[
                RssSlide(content=slide_content, template=template, duration=duration)]))
        else:
            for entry_item, regexp in [rule.split(';') for rule in exception_rules]:
                if entry_item == 'link_page':
                    if not link_page:
                        with urllib.request.urlopen(entry.link) as response:
                            link_page = response.read().decode(errors='ignore')
                            # in case of redirect(s), geturl() returns the final url of the page
                            entry_item += "@" + response.geturl()
                    item = link_page
                else:
                    item = deep_get(entry, *entry_item.split('.'))
                value = get_value(item, regexp)
                if filter and value is None:
                    capsules.append(RssCapsule(theme=theme, slides=[
                        RssSlide(content=slide_content, template=template, duration=duration)]))
                if not filter and value is not None:
                    capsules.append(RssCapsule(theme=theme, slides=[
                        RssSlide(content=slide_content, template=template, duration=duration)]))
    return capsules
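# Hypothetical deep_get()/get_value() helpers written to match how they are
# called in get_content() above; the project's real helpers may behave differently.
import re


def deep_get(obj, *keys):
    # Walk a nested dict-like feed entry by key path, e.g. deep_get(entry, "title").
    for key in keys:
        if not isinstance(obj, dict) or key not in obj:
            return None
        obj = obj[key]
    return obj


def get_value(item, regexp):
    # Return the first capture group (or the whole match) of regexp in item, if any.
    if item is None:
        return None
    m = re.search(regexp, item)
    if m is None:
        return None
    return m.group(1) if m.groups() else m.group(0)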
def crawl(self, link):
    tryOnce = 0
    robotParser = self.setupRobotParser(link)
    if robotParser.can_fetch("*", link):
        while True:
            try:
                response = urllib.request.urlopen(link)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429:
                    if tryOnce == 1:
                        print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' returning.')
                        return
                    print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' trying again in 120 seconds.')
                    sleep(120)
                    tryOnce = 1
                else:
                    return
            # for handling any other url errors
            except:
                print('Error opening link: ', link, " by thread : ", self.crawlerID)
                return
        returnedLink = response.geturl()
        if returnedLink != link:
            print('Thread ' + str(self.crawlerID) + ': Redirection:' + link + ' to ' + returnedLink + ' returning.')
            return
        urlInfo = response.info()
        dataType = urlInfo.get_content_type()
        if 'html' not in dataType:
            print('Thread ' + str(self.crawlerID) + ': Not HTML ' + link + ' returning.')
            return
        try:
            webContent = response.read().decode(response.headers.get_content_charset('utf-8'))
        except:
            print("Incomplete Read of web content due to a defective http server.")
            webContent = None
        if webContent:
            Crawler.webpagesLock.acquire()
            if Crawler.webpagesSaved < NUMOFPAGES:
                Crawler.webpagesSaved += 1
            else:
                print('Thread ' + str(self.crawlerID) + ': Page number limit reached ')
                Crawler.webpagesLock.release()
                return
            Crawler.webpagesLock.release()
            selector = None
            while True:
                try:
                    selector = WebPages.select().where(WebPages.pageURL == returnedLink).exists()
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                except:
                    break
            if selector:
                print('Thread ' + str(self.crawlerID) + ': Updating webpage ' + link)
                while True:
                    try:
                        WebPages.update(pageContent=webContent).where(
                            WebPages.pageURL == returnedLink).execute()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                    except:
                        break
            else:
                print('Thread ' + str(self.crawlerID) + ': Saving webpage ' + link)
                try:
                    inserted = False
                    while True:
                        try:
                            if not inserted:
                                WebPages(pageURL=returnedLink, pageContent=webContent).save()
                                inserted = True
                            ...
                            PageRank.create(pageURL=returnedLink).update()
                            ...
                            break
                        except (OperationalError, sqlite3.OperationalError) as e:
                            if 'binding' in str(e):
                                break
                            print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable & PageRank')
                            sleep(randint(1, 5))
                        except:
                            break
                # should never happen
                except:
                    print('UnexpectedException: In saving webpage WEEEEEEEEEEEEEEEEEEEEEEE')
            print('Thread ' + str(self.crawlerID) + ': Done saving webpage and starting link extraction ' + link)
            try:
                parser = MyHTMLParser(link)
                parser.feed(str(webContent))
            # should never happen
            except:
                print('UnexpectedException: in parser WEEEEEEEEEEEEEEEEEEEEEEE')
            size = 999
            while True:
                try:
                    for i in range(0, len(parser.links), size):
                        UncrawledTable.insert_many(parser.links[i:i + size]).upsert().execute()
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. UnCrawledTable')
                except:
                    break
            while True:
                try:
                    print("UNCRAWLED URLS = ", UncrawledTable.select().count(), ' Thread ' + str(self.crawlerID))
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. print UnCrawledTable')
                except:
                    break
            print('Thread ' + str(self.crawlerID) + ': Done inserting links ' + link)
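# Hypothetical setupRobotParser() helper used by crawl() above, based on the
# standard-library robot parser; the real project helper may add caching.
import urllib.robotparser
from urllib.parse import urlparse


def setupRobotParser(link):
    parts = urlparse(link)
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url("{0}://{1}/robots.txt".format(parts.scheme, parts.netloc))
    try:
        parser.read()
    except Exception:
        pass  # network errors leave the parser unread; can_fetch() then returns False
    return parser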
# headers = {'User-Agent': user_agent}
# queryval = {'q': 'hello'}
# querystr = urllib.parse.urlencode(queryval)
# url = url + '?' + querystr
# req = urllib.request.Request(url, None, headers)
# with urllib.request.urlopen(req) as response:
#     rtnpage = response.read().decode('utf-8')
#     print(rtnpage)
# try:
#     urllib.request.urlopen(req)
# except urllib.error.HTTPError as e:
#     print(e.code)
#     print(e.read())

if __name__ == '__main__':
    with urllib.request.urlopen('http://python.org') as response:
        print('real url---', response.geturl())
        for k, v in response.info().items():
            print(k, '==', v)
        html = response.read()

    import subprocess
    import os.path
    if os.path.exists('tmp.html') and os.path.isfile('tmp.html'):
        print('remove tmp.html')
        os.remove('tmp.html')
    rtnv = subprocess.check_output(['touch', 'tmp.html'])
    with open('./tmp.html', 'r+') as filehandle:
        for line in html.splitlines():
            filehandle.write(line.decode('utf-8'))
            filehandle.write('\n')
#!/usr/bin/env python
# encoding=UTF-8
import urllib.request as request
import urllib.response
import json
import sys

GossipingIndexUrl = "https://www.ptt.cc/bbs/Gossiping/index.html"

#httpsConOpener = urllib.request.build_opener()
#response = httpsConOpener.open(GossipingIndexUrl)
response = request.urlopen(GossipingIndexUrl)
resp_url = response.geturl()
#resp_info = response.info()
#resp_content = response.read().decode("UTF-8")
#print(resp_url)
#print(resp_info)
#print(resp_content)

# PTT redirects this board to its over-18 confirmation page.
if "ask/over18" in resp_url:
    print("ooops")
    # POST URL-encoded form data back to the confirmation page.
    post_req = request.Request(resp_url, b"yes=yes")
    response = request.urlopen(post_req)
    print(response.geturl())
    print(response.info())
    print(response.read())
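# A hedged alternative for the over-18 gate above: PTT normally remembers the
# answer in a cookie, which a plain urlopen() never stores, so a cookie-aware
# opener is needed before retrying the board index.  The form field names
# ("from", "yes") are assumptions about the ask/over18 page, not confirmed here.
import http.cookiejar
import urllib.request as request
from urllib.parse import urlencode, urlparse

jar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(jar))
index_url = "https://www.ptt.cc/bbs/Gossiping/index.html"
resp = opener.open(index_url)
if "ask/over18" in resp.geturl():
    form = urlencode({"from": urlparse(index_url).path, "yes": "yes"}).encode()
    opener.open("https://www.ptt.cc/ask/over18", form)
    resp = opener.open(index_url)   # retried with the cookie now set
print(resp.geturl())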
# -*- coding:utf-8 -*-
import urllib.request
import urllib.response


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        # Returning None means the 301 is not handled, so opener.open()
        # raises HTTPError instead of following it.
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        # Follow the redirect, but record the status and final URL on the result.
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result


opener = urllib.request.build_opener(RedirectHandler)
url = "http://www.baidu.com"
response = opener.open(url)
data = response.read().decode()
print(data)
print(response.geturl())
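# Illustrative follow-up (assumes the handler above is installed): a response
# that went through a 302 carries the extra attributes set in http_error_302.
resp = opener.open("http://www.baidu.com")
if getattr(resp, "status", None) == 302:
    print("followed a 302 to", resp.newurl)
else:
    print("no 302 redirect; final url is", resp.geturl())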