def scrape(html_content):
    """
    This function could have been a "private" utility, but it was left "public"
    (note the quotation marks: Python has no real public or private access
    modifiers) because the scraping logic can also be invoked with raw HTML
    content, bypassing the actual HTTP request. This has been useful, for
    instance, for unit testing this behavior.

    :param html_content: a string containing the actual HTML content. Cannot be None or empty.
    :return: a dictionary with two keys: "total" and "top5", containing the total number of
             elements and the counts of the top 5 ones, respectively.
    """
    if not html_content:
        raise ValueError('Input is empty')
    parser = HtmlElementsCounter()
    parser.feed(html_content)
    parser.close()  # instructs the parser to consume the input entirely
    total = sum(parser.occurrences_by_tag.values())
    # if the input only has N different elements (N < 5), this list will hold exactly N entries
    top5_elements_with_occurrences = sorted(parser.occurrences_by_tag.items(),
                                            reverse=True, key=lambda x: x[1])[:5]
    return dict(total=total, top5=top5_elements_with_occurrences)

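# --- Illustrative sketch, not part of the original module ---
# The docstring above notes that scrape() can be exercised with raw HTML, which
# is handy in unit tests. HtmlElementsCounter is defined elsewhere; assuming it
# simply tallies start tags into `occurrences_by_tag`, a minimal stand-in plus a
# test call might look like this:

import html.parser
from collections import Counter


class _MinimalElementsCounter(html.parser.HTMLParser):
    """Count how many times each start tag appears (hypothetical stand-in)."""

    def __init__(self):
        super().__init__()
        self.occurrences_by_tag = Counter()

    def handle_starttag(self, tag, attrs):
        self.occurrences_by_tag[tag] += 1


_p = _MinimalElementsCounter()
_p.feed('<html><body><p>a</p><p>b</p></body></html>')
_p.close()
assert sum(_p.occurrences_by_tag.values()) == 4  # html, body, p, p
assert _p.occurrences_by_tag['p'] == 2
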
def get_dependencies(path):
    deps = set()
    parser = DependenciesParser(deps.add)
    with open(path) as f:
        parser.feed(f.read())
    parser.close()
    return iter(deps)

def auth_usr(email, password, client_id, scope, opener):
    print("TRY TO AUTH")
    # TODO: catch the exception
    login_page = "http://oauth.vk.com/oauth/authorize?" + \
                 "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
                 "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
    # print(login_page)
    auth_page = opener.open(login_page)
    auth_page = auth_page.read()
    parser = AuthParser()
    parser.feed(str(auth_page))
    parser.close()
    if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
            "email" not in parser.params or parser.method != "POST":
        parser.error = "Some problems"
    if parser.error != "OK":
        return -1, -1, parser.error
    parser.params["email"] = email
    parser.params["pass"] = password
    parser.params["v"] = "5.2"
    # TODO: catch the exception
    response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode("UTF-8"))
    page = response.read()
    url = response.geturl()
    return page, url, parser.error

def getentry(self):
    # Start with the entry from the parent.
    entry = FileHandler.getentry(self)
    parser = HTMLTitleParser()
    file = self.vfs.open(self.getselector(), "rt")
    try:
        while not parser.gotcompletetitle:
            line = file.readline()
            if not line:
                break
            parser.feed(line)
        parser.close()
    except html.parser.HTMLParseError:
        # Parse error?  Stop parsing, go to here.  We can still
        # return a title if the parse error happened after we got
        # the title.
        pass
    file.close()

    # OK, we've parsed the file and exited because of either an EOF
    # or a complete title (or error).  Now, figure out what happened.
    if parser.gotcompletetitle:
        # Convert all whitespace sequences to a single space.
        # Removes newlines, tabs, etc.  Good for presentation
        # and for security.
        title = re.sub('[\s]+', ' ', parser.titlestr)
        entry.setname(title)
    return entry

def run(self):
    self.parent.visited += [self.url]
    if self.gleb > 0:
        parser = self.HParser(self.url)
        contents = ""
        try:
            response = urlopen(self.url)
            contents = response.read().decode('utf-8')
        except HTTPError:
            pass
        parser.feed(contents)
        parser.close()
        for v in parser.vals:  # crawl the subpages
            if v not in self.parent.visited:
                thr = Probe.Thr(v, self.fun, self.gleb - 1, self.parent)
                thr.start()
                self.parent.threads += [thr]
                self.children += [thr]
    self.fun(self.url)  # run the action on the current page
    # wait for the child threads
    for t in self.children:
        t.join()

def _query_eol_list(self) -> typing.List[str]:
    """Scrape the FreeBSD website and return a list of EOL RELEASES."""
    request = urllib.request.Request(
        self.eol_url,
        headers={
            "Accept-Charset": "utf-8"
        }
    )
    self.logger.verbose(f"Downloading EOL info from {self.eol_url}")
    with urllib.request.urlopen(request) as response:  # nosec: B310
        response_code = response.getcode()
        if response_code != 200:  # noqa: T484
            libioc.errors.DownloadFailed(
                topic="EOL Warnings",
                code=response_code,
                logger=self.logger,
                level="warning"
            )
            return []
        parser = EOLParser()
        data = response.read().decode("utf-8", "ignore")
        parser.feed(data)
        parser.close()
        return parser.eol_releases

def simplify_html(s):
    """Make a real HTML text compatible with Telegram's pseudo-HTML"""
    parser = _HtmlSimplifying()
    parser.feed(s)
    parser.close()
    return parser.result

def main():
    """Make all of the above work together to finally print the RSS feed."""
    # Initial request
    html_string = get_response_body('/archive?type=episodes')

    # Prepare headers for following requests
    HEADERS['Referer'] = (
        'https://www.thisamericanlife.org/archive?type=episodes'
    )
    HEADERS['X-Requested-With'] = 'XMLHttpRequest'

    parser = Parser()
    parser.feed(html_string)
    tree = parser.close()
    episodes = findall_episodes(tree)

    count = tree.find('.//div[@class="count-sort"]/div[@class="count"]').text
    count = int(count.split()[2])
    for page in range(int(count / 48)):
        page = page + 1
        time.sleep(1)
        json_string = get_response_body(f'/archive?type=episodes&page={page}')
        html_string = json.loads(json_string)['html']
        parser = Parser()
        parser.feed(html_string)
        tree = parser.close()
        new_episodes = findall_episodes(tree)
        episodes = episodes + new_episodes

    RSS['rss']['channel']['item'] = episodes
    xml_tree = dictionary_to_xml(RSS)
    xml_string = xml.etree.ElementTree.tostring(
        xml_tree, encoding='utf-8', method='xml'
    ).decode()
    print(xml_string)

def _run_check(self, source, expected_events, collector=EventCollector):
    parser = collector()
    for s in source:
        parser.feed(s)
    parser.close()
    events = parser.get_events()
    if events != expected_events:
        self.fail("received events did not match expected events\n"
                  "Expected:\n" + pprint.pformat(expected_events) +
                  "\nReceived:\n" + pprint.pformat(events))

def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r', encoding='utf8')
    data = f.read()
    f.close()
    parser = MyHTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.get_anchorlist()

def access(page, opener):
    parser = AuthParser()
    parser.feed(str(page))
    parser.close()
    if not parser.form_parsed or parser.url is None or parser.method != "POST":
        parser.error = "Problems with giving access"
        return -1, -1, parser.error
    # TODO: catch the exception
    response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode("UTF-8"))
    return response.geturl()

def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r', encoding='utf8')
    data = f.read()
    f.close()
    parser = MyHTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.get_anchorlist()

def parse_mse_text(text, ignore_soft_newlines=True):
    parser = MSETextParser(ignore_soft_newlines=ignore_soft_newlines)
    parser.feed(text)
    parser.close()
    # add line breaks before bullet points because the parser removes soft line breaks
    result = parser.result.replace('•', '\n•')
    while '  ' in result:
        # remove double spaces that can be generated by replacing soft newlines with spaces
        result = result.replace('  ', ' ')
    result = result.replace(' \n', '\n')  # remove spaces before newlines
    result = result.replace('\n ', '\n')  # remove spaces after newlines
    result = result.strip(' ')  # remove spaces at start/end
    return result.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'"), parser.color_identity

def ExtractNamespaces(html):
    """ Extract the list of namespaces from an html page. """
    try:
        parser = NamespacesFilter()
        parser.feed(html)
    finally:
        parser.close()
    return parser.namespaces

def give_access(doc, opener):
    parser = FormParser()
    parser.feed(doc.decode('utf-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise RuntimeError("Something wrong")
    if parser.method == "POST":
        response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.geturl()

def _give_access(self, doc):
    parser = _FormParser()
    parser.feed(str(doc))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise VKAuthError('Invalid email or password')
    if parser.method == 'post':
        response = self._opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
    else:
        raise VKAuthError('Unexpected method: ' + parser.method)
    return response.geturl()

def parseFileRefs(htmlfile, usedFiles, skipFiles, indent, trace=print):
    """ find files referenced in root, recur for html files """
    trace("%sParsing:" % ("." * indent), htmlfile)
    parser = MyParser(usedFiles, skipFiles, indent)
    text = open(htmlfile).read()
    try:
        parser.feed(text)
    except html.parser.HTMLParseError as E:
        print("==>FAILED:", E)    # file's refs may be missed!
    parser.close()

def __give_access(self, doc):
    parser = FormParser()
    parser.feed(str(doc))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise VKAuthError(4, "Invalid email or password")
    if parser.method == "post":
        response = self.__opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
    else:
        raise VKAuthError(5, "Method " + parser.method)
    return response.geturl()

def parseFileRefs(htmlfile, usedFiles, skipFiles, indent, trace=print):
    """ find files referenced in root, recur for html files """
    trace('%sParsing:' % ('.' * indent), htmlfile)
    parser = MyParser(usedFiles, skipFiles, indent)
    text = open(htmlfile).read()
    try:
        parser.feed(text)
    except html.parser.HTMLParseError as E:
        print('==>FAILED:', E)    # file's refs may be missed!
    parser.close()

def retrieve_capabilities(self, url, urlchain=[], pool=None, identity=None):
    """
    connect to the given URL, retrieve and process the
    capabilities/withdrawals found there
    """
    # detect loops in capability links
    if url in urlchain:
        return

    if not self._default_url:
        self.set_default_url(url)

    if isinstance(url, str):
        url = urllib3.util.parse_url(url)

    if identity is None:
        identity = self._tls_state.extract_peer_identity(url)

    if pool is None:
        if url.host is not None:
            pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
        else:
            print("ConnectionPool not defined")
            exit(1)

    if url.path is not None:
        path = url.path
    else:
        path = "/"

    res = pool.request('GET', path)

    if res.status == 200:
        ctype = res.getheader("Content-Type")
        if ctype == "application/x-mplane+json":
            # Probably an envelope. Process the message.
            self.handle_message(
                mplane.model.parse_json(res.data.decode("utf-8")), identity)
        elif ctype == "text/html":
            # Treat as a list of links to capability messages.
            parser = CrawlParser(strict=False)
            parser.feed(res.data.decode("utf-8"))
            parser.close()
            for capurl in parser.urls:
                self.retrieve_capabilities(url=capurl,
                                           urlchain=urlchain + [url],
                                           pool=pool, identity=identity)

def retrieve_capabilities(self, url, urlchain=[], pool=None, identity=None):
    """
    connect to the given URL, retrieve and process the
    capabilities/withdrawals found there
    """
    # detect loops in capability links
    if url in urlchain:
        return

    if not self._default_url:
        self.set_default_url(url)

    if isinstance(url, str):
        url = urllib3.util.parse_url(url)

    if identity is None:
        identity = self._tls_state.extract_peer_identity(url)

    if pool is None:
        if url.host is not None:
            pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
        else:
            print("ConnectionPool not defined")
            exit(1)

    if url.path is not None:
        path = url.path
    else:
        path = "/"
    print("Client path: " + path)

    res = pool.request('GET', path)

    if res.status == 200:
        # ctype = res.getheader("Content-Type")
        ctype = res.headers['content-type']
        print("Response: " + str(res.data))
        print("Response content type: " + str(ctype))
        if ctype == "application/x-mplane+json":
            # Probably an envelope. Process the message.
            self.handle_message(
                mplane.model.parse_json(res.data.decode("utf-8")), identity)
        elif ctype == "text/html":
            # Treat as a list of links to capability messages.
            parser = CrawlParser(strict=False)
            parser.feed(res.data.decode("utf-8"))
            parser.close()
            for capurl in parser.urls:
                self.retrieve_capabilities(url=capurl,
                                           urlchain=urlchain + [url],
                                           pool=pool, identity=identity)

def _give_access(self, doc, opener):
    parser = FormParser()
    parser.feed(doc.decode(encoding='UTF-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise RuntimeError("Something wrong")
    if parser.method.upper() == "POST":
        request_data = urllib.parse.urlencode(parser.params).encode(
            "utf-8")
        response = opener.open(parser.url, request_data)
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.geturl()

def _give_access(self, doc, opener):
    parser = FormParser()
    parser.feed(doc.decode(encoding='UTF-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise RuntimeError("Something wrong")
    if parser.method.upper() == "POST":
        request_data = urllib.parse.urlencode(
            parser.params).encode("utf-8")
        response = opener.open(parser.url, request_data)
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.geturl()

def _run_check(self, source, expected_events, collector=None):
    if collector is None:
        collector = self.get_collector()
    parser = collector
    for s in source:
        parser.feed(s)
    parser.close()
    events = parser.get_events()
    if events != expected_events:
        self.fail("received events did not match expected events" +
                  "\nSource:\n" + repr(source) +
                  "\nExpected:\n" + pprint.pformat(expected_events) +
                  "\nReceived:\n" + pprint.pformat(events))

def give_access(doc, opener):
    parser = FormParser()
    parser.feed(doc.decode('utf-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None:
        raise RuntimeError("Something wrong")
    if parser.method == "POST":
        response = opener.open(
            parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.geturl()

def ExtractAllPages(html):
    """ Extract the list of wiki pagenames from an AllPages html output. """
    if debug:
        print("extract all pages, htmlsize=%d" % len(html))
    try:
        parser = AllpagesFilter()
        parser.feed(html)
    finally:
        parser.close()
    return parser

def html_to_text(html, maxcol=80):
    try:
        buffer = StringIO()
        formatter = Formatter.AbstractFormatter(TextWriter(buffer, maxcol))
        parser = HTMLParserAnchor(formatter)
        parser.feed(html)
        parser.close()
        text = buffer.getvalue()
        buffer.close()
        return text
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, 'cannot convert html to text: %s' % e)
        return None

def html_to_text(html_text):
    """
    Convert HTML to plain text.

    :param html_text: A fragment of HTML (a string).
    :returns: The plain text (a string).

    This function uses the :class:`HTMLStripper` class that builds on top of
    the :class:`html.parser.HTMLParser` class in the Python standard library.
    """
    parser = HTMLStripper()
    parser.feed(html_text)
    parser.close()
    return parser.output.getvalue()

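# --- Illustrative sketch, not part of the original module ---
# HTMLStripper is defined elsewhere; the docstring above says it builds on
# html.parser.HTMLParser and exposes the collected text via `output`. Assuming
# that, a minimal stand-in and a sample call could look like this:

import html.parser
import io


class _MinimalHTMLStripper(html.parser.HTMLParser):
    """Collect only the character data, discarding all tags (hypothetical stand-in)."""

    def __init__(self):
        super().__init__()
        self.output = io.StringIO()

    def handle_data(self, data):
        self.output.write(data)


_s = _MinimalHTMLStripper()
_s.feed('<p>Hello <b>world</b></p>')
_s.close()
assert _s.output.getvalue() == 'Hello world'
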
def ExtractBaseurl(html):
    """ Finds wiki baseurl on html page """
    try:
        parser = BaseurlFilter()
        parser.feed(html)
    finally:
        parser.close()
    if len(parser.baseurl) > 1:
        print("baseurl: found multiple", parser.baseurl)
    if len(parser.baseurl) == 0:
        raise Exception("no baseurl found")
    return max(parser.baseurl.items(), key=lambda kv: kv[1])[0]

def retrieve_capabilities(self, url, urlchain=[], pool=None, identity=None):
    """
    Connect to the given URL, retrieve and process the
    capabilities/withdrawals found there
    """
    # detect loops in capability links
    if url in urlchain:
        return

    if not self._default_url:
        self.set_default_url(url)

    if isinstance(url, str):
        url = urllib3.util.parse_url(url)

    if identity is None:
        identity = self._tls_state.extract_peer_identity(url)

    if pool is None:
        if url.host is not None:
            pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
        else:
            raise ValueError("HttpInitiatorClient capability retrieval missing connection pool")

    if url.path is not None:
        path = url.path
    else:
        path = "/"

    res = pool.request('GET', path)

    if res.status == 200:
        ctype = res.getheader("Content-Type")
        if ctype == "application/x-mplane+json":
            # Probably an envelope. Process the message.
            self.handle_message(
                mplane.model.parse_json(res.data.decode("utf-8")), identity)
        elif ctype == "text/html":
            # Treat as a list of links to capability messages.
            parser = CrawlParser()
            parser.feed(res.data.decode("utf-8"))
            parser.close()
            for capurl in parser.urls:
                self.retrieve_capabilities(url=capurl,
                                           urlchain=urlchain + [url],
                                           pool=pool, identity=identity)

def _get_eol_list(self) -> typing.List[str]:
    """Scrapes the FreeBSD website and returns a list of EOL RELEASES"""
    request = urllib.request.Request(self.eol_url, headers={
        "Accept-Charset": "utf-8"
    })
    with urllib.request.urlopen(request) as response:  # nosec: B310
        if response.getcode() != 200:  # noqa: T484
            iocage.lib.errors.DistributionEOLWarningDownloadFailed(
                logger=self.logger,
                level="warning")
            return []
        parser = EOLParser()
        data = response.read().decode("utf-8", "ignore")
        parser.feed(data)
        parser.close()
        return parser.eol_releases

def auth_user(email, password, client_id, scope, opener):
    response = opener.open(
        "http://oauth.vk.com/oauth/authorize?" +
        "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" +
        "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
    )
    doc = response.read()
    parser = FormParser()
    parser.feed(doc.decode('utf-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
            "email" not in parser.params:
        raise RuntimeError("Something wrong")
    parser.params["email"] = email
    parser.params["pass"] = password
    if parser.method == "POST":
        response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.read(), response.geturl()

def crawlSuperPingProxyList(self, target):
    self.currentHost = self.SuperPing
    self.client = http.client.HTTPConnection(self.host[self.currentHost], self.port, self.timeout)
    try:
        self.startRequest("GET", "/?ping={0}&locale=en".format(target))
        self.getRawData("")
        parser = WebHTMLParser()
        parser.feed(self.rawData)
        parser.close()
        mdata = parser.get_parsed_data()
        pingData = []
        while re.search("load\(\'\.([^\']+)", mdata):
            sm = re.search("load\(\'\.([^\']+)", mdata)
            mdata = mdata[sm.end(1):]
            pingData.append(sm.group(1))
        self.superPing(pingData)
    except TimeoutError as err:
        logger.log(logging.ERROR, err)
    except Exception as err:
        logger.log(logging.ERROR, err)

def get_county(location):
    state_file = open(utils.get_project_dir() / "state_file.txt", "r")
    state_long_names_to_names = json.loads(state_file.read())
    state_file.close()

    url_base = "http://www.openstreetmap.org/geocoder/search_osm_nominatim_reverse"
    url = form_url(
        url_base, {
            "lat": location.latitude,
            "lon": location.longitude,
            "zoom": 17,
            "minlon": -122,
            "minlat": 47,
            "maxlon": -121,
            "maxlat": 48
        })
    print(url)
    response = get_web_resource(url)
    print(response)

    class StreamingHTMLParser(html.parser.HTMLParser):
        def __init__(self):
            html.parser.HTMLParser.__init__(self)
            self.data = None

        def handle_starttag(self, tag_name, attributes):
            attributes_d = dict(attributes)
            if tag_name == "a" and "data-name" in attributes_d:
                self.data = attributes_d["data-name"]

    parser = StreamingHTMLParser()
    parser.feed(response)
    parser.close()
    match = re.search("(\w* County), ([^,]*)", parser.data)
    county_name = match.group(1)
    state_long_name = match.group(2)
    state_name = state_long_names_to_names[state_long_name]
    return (county_name, state_name)

def get_song_infos_from_deezer_website(search_type, id):
    # search_type: either one of the constants: TYPE_TRACK|TYPE_ALBUM|TYPE_PLAYLIST
    # id: deezer_id of the song/album/playlist (like https://www.deezer.com/de/track/823267272)
    # return: if TYPE_TRACK => song (dict grabbed from the website with information about a song)
    # return: if TYPE_ALBUM|TYPE_PLAYLIST => list of songs
    # raises:
    # Deezer404Exception if
    #   1. you open playlist https://www.deezer.com/de/playlist/1180748301 and click on song Honey from Moby in a new tab:
    #   2. Deezer gives you a 404: https://www.deezer.com/de/track/68925038
    # Deezer403Exception if we are not logged in
    url = "https://www.deezer.com/de/{}/{}".format(search_type, id)
    resp = session.get(url)
    if resp.status_code == 404:
        raise Deezer404Exception(
            "ERROR: Got a 404 for {} from Deezer".format(url))
    if "MD5_ORIGIN" not in resp.text:
        raise Deezer403Exception(
            "ERROR: we are not logged in on deezer.com. Please update the cookie"
        )

    parser = ScriptExtractor()
    parser.feed(resp.text)
    parser.close()

    songs = []
    for script in parser.scripts:
        regex = re.search(r'{"DATA":.*', script)
        if regex:
            DZR_APP_STATE = json.loads(regex.group())
            global album_Data
            album_Data = DZR_APP_STATE.get("DATA")
            if DZR_APP_STATE['DATA']['__TYPE__'] == 'playlist' or DZR_APP_STATE['DATA']['__TYPE__'] == 'album':
                # songs if you searched for album/playlist
                for song in DZR_APP_STATE['SONGS']['data']:
                    songs.append(song)
            elif DZR_APP_STATE['DATA']['__TYPE__'] == 'song':
                # just one song on that page
                songs.append(DZR_APP_STATE['DATA'])
    return songs[0] if search_type == TYPE_TRACK else songs

def fragment_fromstring(text, parser=None, create_parent=None):
    """
    Returns an HTML fragment from a string. The fragment must contain just a
    single element, unless create_parent is given; e.g.,
    fragment_fromstring(string, create_parent='div') will wrap the element in
    a <div>.
    """
    if parser is None:
        parser = HTMLParser(TreeBuilder())
    if create_parent:
        parser.feed("<%s>" % create_parent)
    parser.feed(text)
    if create_parent:
        parser.feed("</%s>" % create_parent)
    return parser.close()

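# Usage sketch (hypothetical, assuming the HTMLParser/TreeBuilder pair used
# above is defined elsewhere in this module and that close() returns the
# parsed root element):
#
#     elem = fragment_fromstring('<p>one element only</p>')
#     wrapper = fragment_fromstring('<b>one</b><i>two</i>', create_parent='div')
#     # `wrapper` is expected to be a <div> element containing both children
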
def __init__(self, url, numCols, extractionMap, exceptions):
    # Request the html.
    request = urllib.request.Request(url)
    request.add_header("User-Agent", self.user_agent)
    try:
        response = urllib.request.urlopen(request)
    except:
        print("Error: Invalid URL. Exiting.")
        exit()
    htmlContent = response.read().decode("utf8")

    # Some files have <br> in the middle of a <td> tag,
    # and cause the parser to misinterpret the data.
    htmlContent = htmlContent.replace("<br>", "")

    # Parse the html.
    parser = CountryParser(numCols, extractionMap, exceptions, strict=False)
    htmlContent = parser.unescape(htmlContent)  # Unescape HTML entities.
    parser.feed(htmlContent)
    parser.close()
    self.__countryData = parser.countryData

def discover(url, timeout=None):
    """Discover the hub url and topic url of a given url.

    First by inspecting the page's headers, then by inspecting the content
    for link tags.

    timeout determines how long to wait for the url to load. It defaults to 3.
    """
    resp = get_content({'REQUEST_TIMEOUT': timeout}, url)
    parser = LinkParser()
    parser.hub_url = (resp.links.get('hub') or {}).get('url')
    parser.topic_url = (resp.links.get('self') or {}).get('url')
    try:
        parser.updated()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            parser.feed(chunk)
        parser.close()
    except Finished:
        return {'hub_url': parser.hub_url, 'topic_url': parser.topic_url}
    raise DiscoveryError("Could not find hub url in topic page")

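# Usage sketch (hypothetical URL; get_content, LinkParser, Finished and
# DiscoveryError are assumed to be defined elsewhere in this module):
#
#     urls = discover('https://example.com/feed', timeout=3)
#     # expected shape: {'hub_url': '...', 'topic_url': '...'}
#     # raises DiscoveryError if no hub link can be found
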
def _query_from_ip_cn(ip_str: str):
    command = """curl 'https://ip.cn/?ip=%(ip)s'
    -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    -H 'sec-fetch-site: same-origin'
    -H 'sec-fetch-mode: navigate'
    -H 'sec-fetch-user: ?1'
    -H 'sec-fetch-dest: document'
    -H 'referer: https://ip.cn/?ip=%(ip)s'
    -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8'
    --compressed -s
    """
    result = dict()
    try:
        ips = ip_str
        str_command = command % dict(ip=ips)
        str_command = " ".join(
            filter(lambda x: x, map(str.strip, str_command.split("\n"))))
        with os.popen(str_command) as inf:
            ctx = inf.read()
        parser = _parser_cls()
        parser.feed(ctx)
        parser.close()
        ret_list = parser.get_ret_list()
        # result['ip'] = ret_list[0]
        result['desc_zh'] = ret_list[1]
        cpc = str.split(ret_list[2], ',') if len(ret_list) > 2 else []
        result['city'] = str.strip(cpc[-3] if len(cpc) >= 3 else '')
        result['province'] = str.strip(cpc[-2] if len(cpc) >= 2 else '')
        result['country'] = str.strip(cpc[-1] if len(cpc) >= 1 else '')
    except Exception as err:
        result['error'] = str(err)
    return result

def get_windows_table():
    global _windows_table
    # If we already loaded _windows_table, no need to load it all over again.
    if _windows_table:
        return _windows_table

    # windows-rcs.html was fetched on 2015-03-24 with the following command:
    # curl -o windows-rcs.html \
    #     https://msdn.microsoft.com/en-us/library/cc704588.aspx
    parser = TableParser()
    with open(os.path.join(os.path.dirname(__file__), "windows-rcs.html")) as hf:
        # We tried feeding the file data to TableParser in chunks, to avoid
        # buffering the entire file as a single string. Unfortunately its
        # handle_data() cannot tell the difference between distinct calls
        # separated by HTML tags, and distinct calls necessitated by a chunk
        # boundary. Sigh! Read in the whole file. At the time this was
        # written, it was only 500KB anyway.
        parser.feed(hf.read())
        parser.close()
    table = parser.table

    # With our parser, any <tr><th>...</th></tr> row leaves a table entry
    # consisting only of an empty list. Remove any such.
    while table and not table[0]:
        table.pop(0)

    # We expect rows of the form:
    # [['0x00000000', 'STATUS_SUCCESS'],
    #  ['The operation completed successfully.']]
    # The latter list will have multiple entries if Microsoft embedded <br/>
    # or <p> ... </p> in the text, in which case joining with '\n' is
    # appropriate.
    # Turn that into a dict whose key is the hex string, and whose value is
    # the pair (symbol, desc).
    _windows_table = dict(
        (key, (symbol, '\n'.join(desc)))
        for (key, symbol), desc in table)

    return _windows_table

def auth_user(email, password, client_id, scope, opener):
    response = opener.open(
        "http://oauth.vk.com/oauth/authorize?" +
        "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" +
        "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
    )
    doc = response.read()
    parser = FormParser()
    parser.feed(doc.decode('utf-8'))
    parser.close()
    if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
            "email" not in parser.params:
        raise RuntimeError("Something wrong")
    parser.params["email"] = email
    parser.params["pass"] = password
    if parser.method == "POST":
        response = opener.open(
            parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.read(), response.geturl()

def get_tests(self, task, i, j):
    proto = fetch_protocol(self, task, i, j).decode('utf-8', 'replace')
    parser = TestsParser()
    parser.feed(proto.split('\n', 2)[-1])
    parser.close()
    tests = []
    for i in parser.test_meta:
        meta = {}
        files = {}
        for k, v in i.items():
            if k == 'checker-comment':
                k = 'checker'
            if k in ('input', 'output', 'stderr', 'correct', 'checker'):
                files[k] = v
            else:
                meta[k] = v
        tests.append((meta, files))
    compiler_output = parser.global_meta.get('compiler_output', '')
    try:
        del parser.global_meta['compiler_output']
    except KeyError:
        pass
    return (compiler_output, parser.global_meta, tests)

def get_place_data(place_name):
    url_base = "http://openstreetmap.org/geocoder/search_osm_nominatim"
    url = form_url(url_base, {"query": place_name})
    print(url)
    response = get_web_resource(url)
    print(response)

    class StreamingHTMLParser(html.parser.HTMLParser):
        def __init__(self):
            html.parser.HTMLParser.__init__(self)
            self.data = None
            self.data_found = False

        def handle_starttag(self, tag_name, attributes):
            if self.data_found:
                return
            attributes_d = dict(attributes)
            if tag_name == "li":
                if self.data is None:
                    self.data = dict()
            if tag_name == "a" and "data-lat" in attributes_d:
                self.data["location"] = Location(attributes_d["data-lon"],
                                                 attributes_d["data-lat"])
                self.data_found = True

        def handle_data(self, chars):
            if self.data_found:
                return
            if self.data is not None:
                if "location" not in self.data:
                    self.data["usage"] = chars[:-1]

    parser = StreamingHTMLParser()
    parser.feed(response)
    parser.close()
    return parser.data

def _auth_user(self):
    url = AUTH_URL % (self.app_id, self.scope)
    response = self.opener.open(url)
    doc = response.read().decode(encoding='UTF-8')
    parser = FormParser()
    parser.feed(doc)
    parser.close()
    if not parser.form_parsed \
            or parser.url is None \
            or "pass" not in parser.params \
            or "email" not in parser.params:
        raise RuntimeError("Something wrong")
    parser.params["email"] = self.login
    parser.params["pass"] = self.password
    if parser.method.upper() == "POST":
        request_data = urllib.parse.urlencode(parser.params).encode(
            "utf-8")
        response = self.opener.open(parser.url, request_data)
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.read(), response.geturl()

def _auth_user(self):
    url = AUTH_URL % (self.app_id, self.scope)
    response = self.opener.open(url)
    doc = response.read().decode(encoding='UTF-8')
    parser = FormParser()
    parser.feed(doc)
    parser.close()
    if not parser.form_parsed \
            or parser.url is None \
            or "pass" not in parser.params \
            or "email" not in parser.params:
        raise RuntimeError("Something wrong")
    parser.params["email"] = self.login
    parser.params["pass"] = self.password
    if parser.method.upper() == "POST":
        request_data = urllib.parse.urlencode(
            parser.params).encode("utf-8")
        response = self.opener.open(parser.url, request_data)
    else:
        raise NotImplementedError("Method '%s'" % parser.method)
    return response.read(), response.geturl()

def getentry(self):
    # Start with the entry from the parent.
    entry = FileHandler.getentry(self)

    parser = HTMLTitleParser()
    with self.vfs.open(self.getselector(), "rb") as fp:
        while not parser.gotcompletetitle:
            line = fp.readline()
            if not line:
                break
            # The PY3 HTML parser doesn't handle surrogateescape
            parser.feed(line.decode(errors="replace"))
        parser.close()

    # OK, we've parsed the file and exited because of either an EOF
    # or a complete title (or error).  Now, figure out what happened.
    if parser.gotcompletetitle:
        # Convert all whitespace sequences to a single space.
        # Removes newlines, tabs, etc.  Good for presentation
        # and for security.
        title = re.sub(r"[\s]+", " ", parser.titlestr)
        entry.setname(title)
    return entry

def parse(source=source):
    parser = html.parser.HTMLParser()
    parser.feed(source)
    parser.close()

def auth(login, passwd, appid, scope):
    if not isinstance(scope, list):
        scope = [scope]
    _opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
        urllib.request.HTTPRedirectHandler())
    try:
        response = _opener.open(
            'http://oauth.vk.com/oauth/authorize?' +
            'redirect_uri=oauth.vk.com/blank.html&response_type=token&' +
            'client_id={0}&scope={1}&display=wap'.format(appid, ','.join(scope))
        )
    except urllib.error.URLError as e:
        raise VKAuthError('Cant connect to vk.com or app_id is invalid.')
    except Exception as e:
        raise VKAuthError('Unhandled exception: ' + str(e))

    doc = response.read().decode()
    parser = _FormParser()
    parser.feed(doc)
    parser.close()
    if not parser.form_parsed or parser.url is None or 'pass' not in parser.params or 'email' not in parser.params:
        raise VKAuthError('Unexpected response page o_O')
    parser.params['email'] = login
    parser.params['pass'] = passwd
    parser.method = 'POST'

    keys = [i for i in parser.params]
    for i in keys:
        b = '1'.encode()
        if type(i) != type(b):
            a = i.encode()
        else:
            a = i
        if type(parser.params[i]) != type(b):
            parser.params[a] = parser.params[i].encode()
        else:
            parser.params[a] = parser.params[i]
        parser.params.pop(i)

    response = _opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
    doc = response.read()
    url = response.geturl()

    if urllib.parse.urlparse(url).path != '/blank.html':
        parser = _FormParser()
        parser.feed(str(doc))
        parser.close()
        if not parser.form_parsed or parser.url is None:
            raise VKAuthError('Invalid email or password')
        if parser.method == 'post':
            response = _opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
        else:
            raise VKAuthError('Unexpected method: ' + parser.method)
        url = response.geturl()
        if urllib.parse.urlparse(url).path != "/blank.html":
            raise VKAuthError('Invalid email or password')

    answer = dict(tuple(kv_pair.split('=')) for kv_pair in urllib.parse.urlparse(url).fragment.split('&'))
    if 'access_token' not in answer or 'user_id' not in answer:
        raise VKAuthError('Missing some values in answer')
    return answer['access_token'], answer['user_id'], answer['expires_in']

def parse(source=source):
    parser = self.get_collector()
    parser.feed(source)
    parser.close()

        "Referer": "http://mobile.9om.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    },
    method="GET",
)
try:
    with urllib.request.urlopen(req, timeout=33) as f:
        print("status", f.status, f.reason)
        res = f.read().decode("utf-8")
        parser = MyHTMLPraser()
        parser.feed(res)
        parser.close()
except urllib.error.HTTPError as e:
    print("Server could not fulfill the request, error code", e.code, e.info())
except urllib.error.URLError as e:
    print("We failed to reach the server:", e.reason)
except socket.timeout:
    print("Time out")
    # saveMyData()  # if the program is force-killed after a timeout, the crawl
    # results are never written and the output file ends up empty

count = 0
count += 1
if count % 9 == 0:
    saveMyData()
    count = 0

def parser_test(html_str):
    # parse the html source file
    parser = MyHTMLParser(strict=False)
    parser.feed(html_str)
    parser.close()

def discover(url):
    '''Perform service discovery on the OP URL.
    Return list of service types, and the auth/2.0 URL,
    or None if discovery fails.'''
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    assert not fragment
    if scheme == 'https':
        conn = http.client.HTTPSConnection(netloc)
    elif scheme == 'http':
        conn = http.client.HTTPConnection(netloc)
    else:
        raise ValueError("Unsupported scheme " + scheme)
    # conn.set_debuglevel(1)
    if query:
        path += '?' + query
    try:
        conn.connect()
    except:
        # DNS or TCP error
        return None

    # httplib in 2.5 incorrectly sends https port in Host
    # header even if it is 443
    conn.putrequest("GET", path, skip_host=1)
    conn.putheader('Host', netloc)
    conn.putheader('Accept', "text/html; q=0.3, " +
                   "application/xhtml+xml; q=0.5, " +
                   "application/xrds+xml")
    conn.endheaders()

    res = conn.getresponse()
    data = res.read()
    conn.close()

    if res.status in (301, 302, 303, 307):
        return discover(res.msg.get('location'))

    if sys.version_info < (3, 0):
        content_type = res.msg.gettype()
    else:
        content_type = res.msg.get_content_type()

    # Yadis 6.2.5 option 2 and 3: header includes x-xrds-location
    xrds_loc = res.msg.get('x-xrds-location')
    if xrds_loc and content_type != 'application/xrds+xml':
        return discover(xrds_loc)

    if content_type in ('text/html', 'application/xhtml+xml'):
        parser = OpenIDParser()
        parser.feed(data.decode('latin-1'))
        parser.close()
        # Yadis 6.2.5 option 1: meta tag
        if parser.xrds_location:
            return discover(parser.xrds_location)
        # OpenID 7.3.3: attempt html based discovery
        op_endpoint = parser.links.get('openid2.provider')
        if op_endpoint:
            op_local = parser.links.get('openid2.local_id')
            return ['http://specs.openid.net/auth/2.0/signon'], op_endpoint, op_local
        # 14.2.1: 1.1 compatibility
        op_endpoint = parser.links.get('openid.server')
        if op_endpoint:
            op_local = parser.links.get('openid.delegate')
            return ['http://openid.net/signon/1.1'], op_endpoint, op_local
        # Discovery failed
        return None
    elif content_type == 'application/xrds+xml':
        # Yadis 6.2.5 option 4
        doc = ElementTree.fromstring(data)
        return _extract_services(doc)
    else:
        # unknown content type
        return None

def __init__(self, email, password, client_id, permissions):
    """
    VKAuth(email, password, application id, scope)
    where scope is a list of permissions like ['friends', 'photos'].

    If the object is successfully initialised, the access token will be in
    VKAuth.access_token and the user id in VKAuth.user_id.

    Raises VKAuthError on errors.
    """
    if not isinstance(permissions, list):
        permissions = [permissions]
    for element in permissions:
        if element not in self.valid_scope:
            raise VKAuthError(1, 'invalid scope element: ' + element)

    self.__opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
        urllib.request.HTTPRedirectHandler())
    try:
        response = self.__opener.open(
            "http://oauth.vk.com/oauth/authorize?" +
            "redirect_uri=oauth.vk.com/blank.html&response_type=token&" +
            "client_id={0}&scope={1}&display=wap".format(client_id, ",".join(permissions))
        )
    except urllib.error.URLError as E:
        raise VKAuthError(2, 'internet connection failed')
    except Exception as E:
        raise VKAuthError(0, 'Unhandled exception: ' + str(E))

    doc = response.read().decode()
    parser = FormParser()
    parser.feed(doc)
    parser.close()
    if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
            "email" not in parser.params:
        raise VKAuthError(3, 'wrong response page oO')
    parser.params["email"] = email
    parser.params["pass"] = password
    parser.method = 'POST'

    keys = [i for i in parser.params]
    for i in keys:
        b = '1'.encode()
        if type(i) != type(b):
            a = i.encode()
        else:
            a = i
        if type(parser.params[i]) != type(b):
            parser.params[a] = parser.params[i].encode()
        else:
            parser.params[a] = parser.params[i]
        parser.params.pop(i)

    try:
        response = self.__opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
    except urllib.error.URLError as E:
        raise VKAuthError(2, 'internet connection failed')
    except Exception as E:
        raise VKAuthError(0, 'Unhandled exception: ' + str(E))

    doc = response.read()
    url = response.geturl()
    if urllib.parse.urlparse(url).path != "/blank.html":
        url = self.__give_access(doc)
    if urllib.parse.urlparse(url).path != "/blank.html":
        raise VKAuthError(4, "Invalid email or password")

    def split_key_value(kv_pair):
        kv = kv_pair.split("=")
        return kv[0], kv[1]

    answer = dict(split_key_value(kv_pair)
                  for kv_pair in urllib.parse.urlparse(url).fragment.split("&"))
    if "access_token" not in answer or "user_id" not in answer:
        raise VKAuthError(5, "Missing some values in answer")
    self.access_token = answer["access_token"]
    self.user_id = answer["user_id"]

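# Usage sketch based on the docstring above (the app id, credentials and scope
# shown here are placeholders, not real values):
#
#     auth = VKAuth('user@example.com', 's3cret', 1234567, ['friends', 'photos'])
#     token, uid = auth.access_token, auth.user_id
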