def __init__(self):
    HTMLParser.__init__(self)
    self.InRow = 0
    self.InEntry = 0
    self.table = []
    self.tmpRow = []
    self.hyperlinks = []
    self.RunNumber = 0
    self.TriggerRates = []
    self.Nevts = []
    self.LiveLumiByLS = []
    self.DeliveredLumiByLS = []
    self.FirstLS = -1
    self.LastLS = -1
    self.AvLiveLumi = []
    self.AvDeliveredLumi = []
    self.AvDeadtime = []
    self.DeadTime = []  # grant
    self.L1Prescales = []
    self.RunPage = ''
    self.RatePage = ''
    self.LumiPage = ''
    self.L1Page = ''
    self.L1_LS_Page = ''  # grant
    self.PrescaleColumn = []
    self.PrescaleColumnString = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.lasttag = None
    self.title = ""
    self.pagedata = StringIO()
    self.links = []
    self.inbody = False
def __init__(self):
    HTMLParser.__init__(self)
    self.trouve = False
    self.encours = False
    self.correspondance = {
        'CVSS Score': 'cvss_score',
        'Confidentiality Impact': 'confidentialite',
        'Integrity Impact': 'integrite',
        'Availability Impact': 'disponibilite',
        'Access Complexity': 'complexite',
        'Authentication': 'authentification',
        'Vulnerability Type(s)': 'type',
        'CWE ID': None,
        'Gained Access': 'acces_obtention',
    }
    self.reponse = {
        'cvss_score': None,
        'confidentialite': None,
        'integrite': None,
        'disponibilite': None,
        'complexite': None,
        'authentification': None,
        'type': None,
        'acces_obtention': None,
    }
    self.precedent = None
def __init__(self):
    HTMLParser.__init__(self)
    self.tracks = []
    self.endDate = None
    self.curData = ''
    self.curTrack = {}
    self.recording = None
def __init__(self, site=None):
    HTMLParser.__init__(self)
    dict.__init__(self, ())
    self.in_form = False
    self.select = None
    if site:
        self.load(site)
def feed(self, token):
    ttype, tvalue, tstart, tend, tline = token
    self.line = tline

    # Handle whitespace.
    (prev_row, prev_col) = self.lastPos
    (cur_row, cur_col) = tstart
    (end_row, end_col) = tend
    assert cur_row >= prev_row, "Unexpected jump in row"
    self.lastPos = (end_row, end_col)

    # Are we now on a new line?
    if cur_row > prev_row:
        self._appendRows(cur_row - prev_row)

    # Are we on a multiline statement?
    if end_row > cur_row:
        self._appendRows(end_row - cur_row)

    # Interpret jumps on the same line as a single space.
    if cur_row == prev_row and cur_col > prev_col:
        HTMLParser.feed(self, ' ')

    HTMLParser.feed(self, tvalue)
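# feed() above consumes Python tokenize-style 5-tuples rather than raw HTML
# strings. A hedged usage sketch: Parser is a hypothetical name for the
# surrounding class, which is assumed to initialize self.lastPos and to
# implement _appendRows.
import tokenize
from StringIO import StringIO

parser = Parser()  # hypothetical; stands in for the class owning feed()
for token in tokenize.generate_tokens(StringIO("x = 1\ny = 2\n").readline):
    parser.feed(token[:5])  # pass the (type, value, start, end, line) tuple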
def get_jobs(self):
    try:
        jobs_start_time = time.time()
        h = HTMLParser()
        html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
        counter = 0
        for a in data:
            if a.has_attr('href'):
                counter = counter + 1
                # self.DrawSpinner(counter)
                try:
                    return_code = self.get_job_info(
                        self.browser, self.base_job_url + a['href'].split('?')[1])
                    if return_code == 1:  # in case the error pages start to come
                        jobs_end_time = time.time()
                        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
                        return
                except Exception:
                    continue
        jobs_end_time = time.time()
        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
    except Exception as e:
        print 'exception= ', str(e)
        # print 'stacktrace= ', traceback.print_exc()
        print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)
def __init__(self):
    HTMLParser.__init__(self)
    self.url = None
    self.params = {}
    self.in_form = False
    self.form_parsed = False
    self.method = "GET"
def wolfplex(options):
    # Clean out previously imported events.
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        # Guard against entries without a link before reading the href.
        if event.a:
            base_domain = "" if event.a["href"].startswith("http") else "http://www.wolfplex.org"
            url = base_domain + event.a["href"]
        else:
            url = "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (
                title.encode("Utf-8"),
                "wolfplex",
                location.encode("Utf-8") if location else ""
            )
def __init__(self, properties):
    HTMLParser.__init__(self)
    # properties is a string like "k1,v1;k2,v2"; ignore entries with no comma.
    self.properties = dict(
        (key, value)
        for key, value in (
            prop.split(',')
            for prop in properties.split(';')
            if prop.find(',') > -1
        )
    )
    self.data = []
    self.in_td = 0
    self.tr_name = None
def __init__(self): """ Constructor; initializes washer """ HTMLParser.__init__(self) self.result = '' self.nb = 0 self.previous_nbs = [] self.previous_type_lists = [] self.url = '' self.render_unallowed_tags = False self.allowed_tag_whitelist = \ CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST self.allowed_attribute_whitelist = \ CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST # javascript: self.re_js = re.compile( ".*(j|j|J)"\ "\s*(a|a|A)"\ "\s*(v|v|V)"\ "\s*(a|a|A)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL) # vbscript: self.re_vb = re.compile( ".*(v|v|V)"\ "\s*(b|b|B)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL)
def __init__(self, new_path, filename, reference_support_info, host=Host(), convert_test_harness_links=True):
    HTMLParser.__init__(self)
    self._host = host
    self._filesystem = self._host.filesystem
    self._webkit_root = WebKitFinder(self._filesystem).webkit_base()
    self.converted_data = []
    self.converted_properties = []
    self.converted_property_values = []
    self.in_style_tag = False
    self.style_data = []
    self.filename = filename
    self.reference_support_info = reference_support_info

    resources_path = self.path_from_webkit_root('LayoutTests', 'resources')
    resources_relpath = self._filesystem.relpath(resources_path, new_path)
    self.new_test_harness_path = resources_relpath
    self.convert_test_harness_links = convert_test_harness_links

    # These settings might vary between WebKit and Blink.
    self._css_property_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSPropertyNames.in')
    self._css_property_value_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSValueKeywords.in')

    self.test_harness_re = re.compile('/resources/testharness')

    self.prefixed_properties = self.read_webkit_prefixed_css_property_list(self._css_property_file)
    prop_regex = ('([\s{]|^)('
                  + "|".join(prop.replace('-webkit-', '') for prop in self.prefixed_properties)
                  + ')(\s+:|:)')
    self.prop_re = re.compile(prop_regex)

    self.prefixed_property_values = self.read_webkit_prefixed_css_property_list(self._css_property_value_file)
    prop_value_regex = ('(:\s*|^\s*)('
                        + "|".join(value.replace('-webkit-', '') for value in self.prefixed_property_values)
                        + ')(\s*;|\s*}|\s*$)')
    self.prop_value_re = re.compile(prop_value_regex)
def __init__(self):
    HTMLParser.__init__(self)
    self.in_records_table = False
    self.record = -1
    self.column = -1
    self.data_row = False
    self.data = []
def __init__(self, feed_data):
    HTMLParser.__init__(self)
    self.courses = tuple()
    self.is_course = False
    self.is_coursename = False
    self.is_homework = False
    self.feed(feed_data)
def __init__(self):
    HTMLParser.__init__(self)
    self.subjectList = {}
    self.tagi = 0
    self.tdi = 0
    self.dataFlag = 0
    self.subName = ""
def getImageLocation(comicRequest):
    titleString = 'id="ctitle">'
    captionString = 'title="'
    imageString = '//imgs.xkcd.com/comics/'

    response = urllib2.urlopen(parseComicRequest(comicRequest))
    html = response.read()

    titleStart = html.find(titleString) + len(titleString)
    titleEnd = html[titleStart:].find('<') + titleStart
    title = html[titleStart:titleEnd]

    imageAddressStart = html.find(imageString)
    imageAddressEnd = html[imageAddressStart:].find('"') + imageAddressStart
    imageAddress = html[imageAddressStart:imageAddressEnd]

    captionStart = (html[imageAddressEnd:].find(captionString)
                    + imageAddressEnd + len(captionString))
    captionEnd = html[captionStart:].find('"') + captionStart
    caption = html[captionStart:captionEnd]

    parser = HTMLParser()
    caption = parser.unescape(caption)
    title = parser.unescape(title)

    return '*' + title + "*\nhttp:" + str(imageAddress) + '\n' + caption
def __init__(self, tag="a", attr="href", process=None, unique=False): HTMLParser.__init__(self) self.scan_tag = tag if callable(tag) else lambda t: t == tag self.scan_attr = attr if callable(attr) else lambda a: a == attr self.process_attr = process if callable(process) else lambda v: v self.unique = unique
def __init__(self):
    HTMLParser.__init__(self)
    self.in_div = False
    self.in_a = False
    self.pattern = re.compile(r'(.*)\((.*)\)')
    self.tangshi_list = []
    self.current_poem = {}
def __init__(self):
    HTMLParser.__init__(self)
    self.glink = False
    self.elink = False
    self.ingroup = []
    self.href = ''
    self.name = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.foundName = False
    self.foundDescription = False
    self.foundPrice = False
    self.foundScore = True
    self.gameInfo = {}
def __init__(self, builder=None, encoding=None):
    self.__stack = []
    if builder is None:
        builder = ElementTree.TreeBuilder()
    self.__builder = builder
    self.encoding = encoding or "iso-8859-1"
    HTMLParser.__init__(self)
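# A parser initialized this way usually just forwards parse events to its
# TreeBuilder. A minimal sketch of the forwarding handlers, assuming they
# live in the same class (so the name-mangled self.__builder resolves);
# real implementations also normalize tags, entities, and encoding:
def handle_starttag(self, tag, attrs):
    self.__builder.start(tag, dict(attrs))

def handle_endtag(self, tag):
    self.__builder.end(tag)

def handle_data(self, data):
    self.__builder.data(data)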
def __init__(self):
    # super(formParser, self).__init__()
    HTMLParser.__init__(self)
    self.dict = {}
    self.stack = []
    self.post = ""
def update_event_description(event_id, description, analyst):
    """
    Update event description.

    :param event_id: The ObjectId of the Event to update.
    :type event_id: str
    :param description: The new description.
    :type description: str
    :param analyst: The user updating this Event.
    :type analyst: str
    :returns: dict with keys "success" (boolean) and "message" (str)
    """

    if not description:
        return {'success': False, 'message': "No description to change"}
    event = Event.objects(id=event_id).first()
    if not event:
        return {'success': False, 'message': "No event found"}

    # Have to unescape the submitted data. Use unescape() for &lt; and
    # friends; use urllib2.unquote() for %3C and friends.
    h = HTMLParser()
    description = h.unescape(description)
    event.description = description
    try:
        event.save(username=analyst)
        return {'success': True}
    except ValidationError as e:
        return {'success': False, 'message': e}
def feed(self, data):
    no_cc = u'no closed captioning available'

    if u'<html' in data.lower():
        raise CaptionReadSyntaxError(u'SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError(u'SAMI File contains "%s"' % no_cc)

    # Try to find a style tag in the SAMI.
    try:
        # Prevent a BS4 error with huge SAMI files with unclosed tags.
        index = data.lower().find(u"</head>")
        self.styles = self._css_parse(
            BeautifulSoup(data[:index]).find(u'style').get_text())
    except AttributeError:
        self.styles = {}

    # Fix erroneous italics tags.
    data = data.replace(u'<i/>', u'<i>')

    # Fix awkward tags found in some SAMIs.
    data = data.replace(u';>', u'>')

    try:
        HTMLParser.feed(self, data)
    except HTMLParseError as e:
        raise CaptionReadSyntaxError(e)

    # Close any tags that remain in the queue.
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += u"</%s>" % closing_tag

    return self.sami, self.styles, self.langs
def __init__(self, *a, **kw):
    self.indent = ''
    HTMLParser.__init__(self, *a, **kw)
    self.processed_text = ''
    self.tagtracker = []
    self.error_line = 0
    self.line_number = 1
def __init__(self, pdf):
    HTMLParser.__init__(self)
    self.style = {}
    self.pre = False
    self.href = ''
    self.align = ''
    self.page_links = {}
    self.font_list = ("times", "courier", "helvetica")
    self.font = None
    self.font_stack = []
    self.pdf = pdf
    self.r = self.g = self.b = 0
    self.indent = 0
    self.bullet = []
    self.set_font("times", 12)
    self.font_face = "times"     # initialize font
    self.color = 0               # initialize font color
    self.table = None            # table attributes
    self.table_col_width = None  # column (header) widths
    self.table_col_index = None  # current column index
    self.td = None               # cell attributes
    self.th = False              # header enabled
    self.tr = None
    self.theader = None          # table header cells
    self.tfooter = None          # table footer cells
    self.thead = None
    self.tfoot = None
    self.theader_out = self.tfooter_out = False
def __init__(self):
    HTMLParser.__init__(self)
    self.title = False
    self.updated = False
    self.titlestr = ''
    self.updatedstr = ''
    self.list = []
def feed(self, data):
    from HTMLParser import HTMLParser
    data_with_br = data.replace("\n", "<br/>")
    HTMLParser.feed(self, data_with_br)
    if len(self.current_line) > 0:
        self.lines.append(self.current_line)
        self.current_line = ''
def __init__(self, url):
    """Returns a new Sequence object with the specified url.

    url: link to an mp3.zing.vn web page
    """
    HTMLParser.__init__(self)
    self.song_name = []
    self.song_artist = []
    self.song_link = []
    self.song_type = []
    req = urlopen(url)  # open connection to web page
    data = None
    if req.info().get('Content-Encoding') == "gzip":
        buf = StringIO(req.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read().split("\n")
    else:
        data = req.read().split("\n")  # split web page on \n
    feed_data = None
    for param in data:
        if param.find('<param name="flashvars" value="') > -1:
            # Found the line holding the XML url.
            feed_data = param
            break
    self.feed(feed_data)  # parse html data
def insert_to(project_url, destination, find_what, indent=0):
    url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
    response = urllib2.urlopen(url)
    if response.getcode() == 200:
        with open(destination, 'r') as dest:
            dest_contents = dest.readlines()
        lines = ''.join(dest_contents)
        content = HTMLParser().unescape(response.read())
        if content.replace(' ', '') in lines.replace(' ', ''):
            print_out('IGNORED', destination)
            return

        generated = []
        for line in dest_contents:
            generated.append(line)
            if line.lower().find(find_what.lower()) >= 0:
                spaces = len(line) - len(line.lstrip())
                for l in content.split('\n'):
                    if l:
                        generated.append('%s%s\n' % (' ' * (spaces + indent), l))

        with open(destination, 'w') as dest:
            for line in generated:
                dest.write(line)

        print_out('INSERT', destination)
def __init__(self):
    HTMLParser.__init__(self)
    self.map = {}
    self.map_flag = False
    self.list = []
    self.list_flag = False
import csv
from HTMLParser import HTMLParser
import io

books = []
authors = {}
book_authors = []
author_id = 1
publishers = {}
book_publishers = []
publisher_id = 1

h = HTMLParser()

with open('books.csv', 'rb') as csvfile:
    # dialect = csv.Sniffer().sniff(csvfile.read())
    # csvfile.seek(0)
    # reader = csv.reader(csvfile, dialect)
    reader = csv.reader(csvfile, delimiter='\t')
    first_line = True
    for row in reader:
        if first_line:
            first_line = False
            continue
        books.append([
            row[0],
            h.unescape(row[2].decode('utf-8').strip()).encode('utf-8'),
            row[4],
            row[6]
        ])
        auths = row[3]
        current_book_authors = []
        for auth in auths.split(','):
            auth = h.unescape(auth.decode('utf-8').strip())
def __init__(self):
    HTMLParser.__init__(self)
    self.toc = []
    self.page_title = None
    self._recent_tag = None
    self._current_heading = {}
def __init__(self,target="viewcourses"): HTMLParser.__init__(self) self.target=target self.flag=False self.payload={}
def __init__(self):
    hp.__init__(self)
    self.links = []
# -*- coding: utf-8 -*-
import re
from HTMLParser import HTMLParser

from django.utils.html import strip_tags

h = HTMLParser()

# List of words that aren't judge names
NOT_JUDGE = [
    "above", "absent", "acting", "active", "adopted", "affirm", "after",
    "agrees", "all", "although", "and", "affirmed", "appeals", "appellate",
    "argument", "argued", "arj", "ass", "assign", "assigned",
def __init__(self):
    HTMLParser.__init__(self)
    self.path = []
    self.title = []
    self.message = []
def __init__(self): """Initialize the parser.""" HTMLParser.__init__(self) self.stable_version = Version('0.0.0') self.devel_version = Version('0.0.0')
def update_wiki_tracker(self, comment):
    """
    Update the wiki page of the person earning the delta.

    Note: the comment passed in is the comment awarding the delta; its
    parent comment is the one earning the delta.
    """
    logging.info("Updating wiki")
    comment_url = comment.permalink
    submission_url = comment.submission.permalink
    submission_title = comment.submission.title
    parent = self.reddit.get_info(thing_id=comment.parent_id)
    parent_author = parent.author.name
    author_flair = str(self.subreddit.get_flair(parent_author))
    author_flair = re.search("(flair_text': u')(\d*)", author_flair)
    flair_count = "0 deltas"
    if author_flair:
        flair_count = author_flair.group(2)
        if flair_count == "1":
            flair_count = "1 delta"
        else:
            flair_count += " deltas"
    if comment.author:
        awarder_name = comment.author.name
    else:
        return  # skip, in case the awarding comment is deleted
    today = datetime.date.today()

    # Try to get the wiki page for the user; this throws an exception if
    # the page doesn't exist.
    try:
        user_wiki_page = self.reddit.get_wiki_page(
            self.config.subreddit, "user/" + parent_author)
        # Get the old wiki page content as a markdown string, unescaping
        # any previously escaped HTML characters.
        old_content = HTMLParser().unescape(user_wiki_page.content_md)
        # Alter how many deltas are in the first line.
        try:
            old_content = re.sub("([0-9]+) delta[s]?", flair_count, old_content)
        except:
            print("The 'has received' line in the wiki has failed to update.")
        # Compile a regex to search for the current link formatting.
        # It only matches links that are correctly formatted, so it will not
        # be broken by malformed links or links made by previous versions of
        # DeltaBot.
        regex = re.compile(
            "\\* \\[%s\\]\\(%s\\) \\(\d+\\)"
            % (re.escape(submission_title), re.escape(submission_url)))
        # Search old page content for the link.
        old_link = regex.search(old_content)
        # Variable for updated wiki content.
        new_content = ""
        if old_link:
            # The old link exists; only increase the number of deltas for
            # the post, using re.sub to increment the count in the link.
            new_link = re.sub(
                "\((\d+)\)",
                lambda match: "(" + str(int(match.group(1)) + 1) + ")",
                old_link.group(0))
            # Insert the link to the new delta.
            new_link += "\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
                awarder_name, comment_url + "?context=3",
                today.month, today.day, today.year)
            # Use re.sub to replace the old link with the new link.
            new_content = re.sub(regex, new_link, old_content)
        else:
            # No old link; create one with an initial count of 1, formatted
            # as a markdown list item. "?context=2" means the link shows the
            # comment earning the delta and the comment awarding it. "(1)"
            # is the number of deltas earned from that comment (1 because
            # this is the first delta the user has earned from it).
            add_link = "\n\n* [%s](%s) (1)\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
                submission_title, submission_url, awarder_name,
                comment_url + "?context=2",
                today.month, today.day, today.year)
            # Get previous content as a markdown string and append the new
            # content.
            new_content = user_wiki_page.content_md + add_link
        # Overwrite the old content with the new content.
        self.reddit.edit_wiki_page(self.config.subreddit,
                                   user_wiki_page.page, new_content,
                                   "Updated delta links.")
    # If the page doesn't exist, create it with initial content.
    except:
        # Create a header for the new wiki page.
        initial_text = ("/u/%s has received a request point for the "
                        "following comments:" % parent_author)
        # Create the link, formatted as a markdown list item (see the
        # comments above for the meaning of "?context=2" and "(1)").
        add_link = "\n\n* [%s](%s) (1)\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
            submission_title, submission_url, awarder_name,
            comment_url + "?context=2",
            today.month, today.day, today.year)
        # Combine header and link.
        full_update = initial_text + add_link
        # Write the new content to the wiki page.
        self.reddit.edit_wiki_page(self.config.subreddit,
                                   "user/" + parent_author, full_update,
                                   "Created user's delta links page.")

    # Add the new awardee to the Delta Tracker wiki page.
    delta_tracker_page = self.reddit.get_wiki_page(
        self.config.subreddit, "delta_tracker")
    # Retrieve the tracker page content as a markdown string.
    delta_tracker_page_body = delta_tracker_page.content_md
    # Create a link to the user's wiki page as a markdown list item.
    new_link = "\n\n* /u/%s -- [Delta List](/r/%s/wiki/%s)" % (
        parent_author, self.config.subreddit, parent_author)
    # Append the new link to the old content and overwrite the page.
    new_content = delta_tracker_page_body + new_link
    self.reddit.edit_wiki_page(self.config.subreddit, "delta_tracker",
                               new_content, "Updated tracker page.")
def run(self):
    self.progressbar_show.emit(True)
    self.info_label.emit(
        translate("AddonsInstaller", "Retrieving description..."))
    if len(self.macros[self.idx]) > 2:
        desc = self.macros[self.idx][2]
        url = self.macros[self.idx][4]
    else:
        mac = self.macros[self.idx][0].replace(" ", "_")
        mac = mac.replace("&", "%26")
        mac = mac.replace("+", "%2B")
        url = "https://www.freecadweb.org/wiki/Macro_" + mac
        self.info_label.emit("Retrieving info from " + str(url))
        if ctx:
            u = urllib2.urlopen(url, context=ctx)
        else:
            u = urllib2.urlopen(url)
        p = u.read()
        if sys.version_info.major >= 3 and isinstance(p, bytes):
            p = p.decode("utf-8")
        u.close()
        code = re.findall("<pre>(.*?)<\/pre>", p.replace("\n", "--endl--"))
        if code:
            # code = code[0]
            # Take the biggest code block.
            code = sorted(code, key=len)[-1]
            code = code.replace("--endl--", "\n")
        else:
            self.info_label.emit(
                translate("AddonsInstaller",
                          "Unable to fetch the code of this macro."))
            self.progressbar_show.emit(False)
            self.stop = True
            return
        desc = re.findall(
            "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
            p.replace("\n", " "))
        if desc:
            desc = desc[0]
        else:
            self.info_label.emit(
                translate("AddonsInstaller",
                          "Unable to retrieve a description for this macro."))
            desc = "No description available"
        # Clean HTML escape codes.
        try:
            from HTMLParser import HTMLParser
        except ImportError:
            from html.parser import HTMLParser
        try:
            code = code.decode("utf8")
            code = HTMLParser().unescape(code)
            code = code.encode("utf8")
            code = code.replace("\xc2\xa0", " ")
        except:
            FreeCAD.Console.PrintWarning(
                translate("AddonsInstaller", "Unable to clean macro code: ")
                + mac + "\n")
    self.update_macro.emit(self.idx, self.macros[self.idx] + [desc, code, url])
    if self.macros[self.idx][1] == 1:
        message = ("<strong>"
                   + translate("AddonsInstaller",
                               "This addon is already installed.")
                   + "</strong><br>" + desc
                   + ' - <a href="' + url + '">'
                   '<span style="word-wrap: break-word;width:15em;'
                   'text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    else:
        message = (desc
                   + ' - <a href="' + url + '">'
                   '<span style="word-wrap: break-word;width:15em;'
                   'text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    self.info_label.emit(message)
    self.progressbar_show.emit(False)
    self.stop = True
def decode_html_entities(df):
    print('Decoding HTML entities...')
    h = HTMLParser()
    df['body'] = df['body'].apply(lambda row: h.unescape(row))
    return df
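# A minimal usage sketch for decode_html_entities, assuming df is a pandas
# DataFrame with a 'body' column of entity-escaped strings (pandas is an
# assumption here; the function only requires that df['body'] support .apply):
import pandas as pd

df = pd.DataFrame({'body': ['Tom &amp; Jerry', '&lt;b&gt;bold&lt;/b&gt;']})
df = decode_html_entities(df)
# df['body'] now contains: 'Tom & Jerry' and '<b>bold</b>'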
def __init__(self, sanitizationLevel=_defaultSanitizationLevel):
    HTMLParser.__init__(self)
    if sanitizationLevel not in range(0, 3):
        sanitizationLevel = self._defaultSanitizationLevel
    self._sanitizationLevel = sanitizationLevel
def print_info(self, req, req_body, res, res_body):
    def parse_qsl(s):
        return '\n'.join(
            "%-20s %s" % (k, v)
            for k, v in urlparse.parse_qsl(s, keep_blank_values=True))

    req_header_text = "%s %s %s\n%s" % (req.command, req.path,
                                        req.request_version, req.headers)
    res_header_text = "%s %d %s\n%s" % (res.response_version, res.status,
                                        res.reason, res.headers)

    print with_color(33, req_header_text)

    u = urlparse.urlsplit(req.path)
    if u.query:
        query_text = parse_qsl(u.query)
        print with_color(32, "==== QUERY PARAMETERS ====\n%s\n" % query_text)

    cookie = req.headers.get('Cookie', '')
    if cookie:
        cookie = parse_qsl(re.sub(r';\s*', '&', cookie))
        print with_color(32, "==== COOKIE ====\n%s\n" % cookie)

    auth = req.headers.get('Authorization', '')
    if auth.lower().startswith('basic'):
        token = auth.split()[1].decode('base64')
        print with_color(31, "==== BASIC AUTH ====\n%s\n" % token)

    if req_body is not None:
        req_body_text = None
        content_type = req.headers.get('Content-Type', '')
        if content_type.startswith('application/x-www-form-urlencoded'):
            req_body_text = parse_qsl(req_body)
        elif content_type.startswith('application/json'):
            try:
                json_obj = json.loads(req_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    req_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    req_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                req_body_text = req_body
        elif len(req_body) < 1024:
            req_body_text = req_body
        if req_body_text:
            print with_color(32, "==== REQUEST BODY ====\n%s\n" % req_body_text)

    print with_color(36, res_header_text)

    cookie = res.headers.get('Set-Cookie', '')
    if cookie:
        cookie = parse_qsl(re.sub(r';\s*', '&', cookie))
        print with_color(31, "==== SET-COOKIE ====\n%s\n" % cookie)

    if res_body is not None:
        res_body_text = None
        content_type = res.headers.get('Content-Type', '')
        if content_type.startswith('application/json'):
            try:
                json_obj = json.loads(res_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    res_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    res_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                res_body_text = res_body
        elif content_type.startswith('text/html'):
            m = re.search(r'<title[^>]*>([\s\S]+?)</title>', res_body, re.I)
            if m:
                h = HTMLParser()
                print with_color(32, "==== HTML TITLE ====\n%s\n"
                                 % h.unescape(m.group(1).decode('utf-8')))
        elif content_type.startswith('text/') and len(res_body) < 1024:
            res_body_text = res_body
        if res_body_text:
            print with_color(32, "==== RESPONSE BODY ====\n%s\n" % res_body_text)
def __init__(self):
    HTMLParser.__init__(self)
    self.pageId = None
    self.pageTitle = None
    self.shortURL = None
    self.dest = None
def extract_sentences(self, mode="split", source="fulltext"):
    '''
    Finds sentence boundaries and saves the results in the attribute
    "sentences" as a list of Sentence objects.

    Parameters
    ----------
    mode : str, optional, default = "split"
        Split the sentences ("split") or use the whole "source" as a
        single sentence ("no-split"). Useful for developing and debugging.
    source : str, optional, default = "fulltext"
        Use the "fulltext" or the "abstract" to extract sentences.
    '''
    text = ""
    if source == "fulltext":
        text = str(self.fulltext)
    else:
        text = str(self.abstract)

    if mode == "no-split":
        # Don't try to separate the sentences:
        # everything in the text is just one sentence!
        self.sentences.append(Sentence(originaltext=text))
    else:
        caps = "([A-Z])"
        prefixes = "(Mr|Fig|fig|St|Mrs|Ms|Dr)[.]"
        digits = "([0-9])"
        fig_letters = "([A-Ka-k])"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        species = r"([A-Z])[.] ?([a-z]+)"

        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        text = re.sub(digits + caps + "[.]", " \\1<prd>", text)
        text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
        text = re.sub(digits + "[.]" + fig_letters, "\\1<prd>\\2", text)
        text = re.sub(species, "\\1<prd> \\2", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        if "e.g." in text:
            text = text.replace("e.g.", "e<prd>g<prd>")
        if "i.e." in text:
            text = text.replace("i.e.", "i<prd>e<prd>")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        # sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]

        h = HTMLParser()
        for sentence in sentences:
            sentence = str(h.unescape(sentence))
            if not sentence.strip() or not isinstance(sentence, str):
                continue
            self.sentences.append(Sentence(originaltext=sentence))
def __init__(self):
    HTMLParser.__init__(self)
    self.__text = []
def __init__(self):
    HTMLParser.__init__(self)
    self.item = Commodity()
    self.state = 0
def strip_tags(html):
    parser = HTMLParser()
    html = parser.unescape(html)
    s = MLStripper()
    s.feed(html)
    return s.get_data()
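# strip_tags above depends on an MLStripper class that is not shown in this
# snippet. A minimal sketch of what it presumably looks like, following the
# usual HTMLParser-subclass pattern of keeping only text nodes:
class MLStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        # Only text content is kept; tags are silently dropped.
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)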
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

# For converting & <-> &amp; etc.
try:
    from html import escape
except ImportError:
    from cgi import escape
if sys.version_info[:2] < (3, 4):
    unescape = HTMLParser().unescape
else:
    from html import unescape

try:
    from collections import ChainMap
except ImportError:  # pragma: no cover
    from collections import MutableMapping

try:
    from reprlib import recursive_repr as _recursive_repr
except ImportError:
    def _recursive_repr(fillvalue='...'):
        '''
        Decorator to make a repr function return fillvalue for a recursive
def __init__(self):
    HTMLParser.__init__(self)
    self.word2id = {}
    self.article_id = None
    self.word_pos = None
def __init__(self):
    HTMLParser.__init__(self)
    self.tag_results = {}
def reset(self):
    HTMLParser.reset(self)
    self.state = 0
def __init__(self):
    HTMLParser.__init__(self)
    self.link = None
#!/usr/bin/env python
import re
import requests
from requests_kerberos import HTTPKerberosAuth, OPTIONAL
import subprocess
import socket
import json
import univention.testing.utils as utils
import univention.config_registry as configRegistry
from HTMLParser import HTMLParser

html = HTMLParser()


class SamlError(Exception):
    """Custom error for everything SAML related"""

    def __init__(self, msg):
        self.message = msg

    def __str__(self):
        return repr(self.message)


class SamlLoginError(SamlError):
    def __init__(self, page):
        self.page = page
        self.message = ''
        self._error_evaluation()

    def _error_evaluation(self):
def close(self):
    HTMLParser.close(self)
    return self.__builder.close()
def __init__(self):
    HTMLParser.__init__(self)
    self.result = []
def __init__(self):
    self.links = {}
    f = formatter.NullFormatter()
    HTMLParser.__init__(self, f)
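# The formatter argument marks this as a subclass of the old Python 2
# htmllib.HTMLParser rather than the HTMLParser module's class. A minimal
# sketch of an anchor handler that would populate self.links; counting each
# href per URL is an assumption, since the dict's payload is not shown:
def start_a(self, attrs):
    for name, value in attrs:
        if name == 'href':
            self.links[value] = self.links.get(value, 0) + 1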
def __init__(self, strings):
    HTMLParser.__init__(self)
    self.strings = strings
    self.hit_end_tag = False
    self.in_no_split_tag = False
    self.text = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.result_info = []
    self.link = ''
    self.title = ''
def __init__(self):
    self.__stack = []
    self.__builder = ElementTree.TreeBuilder()
    HTMLParser.__init__(self)