def do_POST(self):
    content_length = int(self.headers['Content-Length'])
    post_data = self.rfile.read(content_length)
    post_data_xml = BeautifulSoup(post_data, "xml")
    data = None
    logging.debug(
        "POST Request,\nPath: {path}\nHeaders:\n{headers}\n\nBody:\n{body}\n"
        .format(path=self.path, headers=self.headers,
                body=post_data_xml.encode_contents()))
    soap_action = self.headers['SOAPAction']
    if soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetConfig"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/b76899b4-ad55-427d-a748-2ecf0829412b
        data = BeautifulSoup(update_handler.get_config_xml, 'xml')
    elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetCookie"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/36a5d99a-a3ca-439d-bcc5-7325ff6b91e2
        data = BeautifulSoup(update_handler.get_cookie_xml, "xml")
    elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/SyncUpdates"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/6b654980-ae63-4b0d-9fae-2abb516af894
        data = BeautifulSoup(update_handler.sync_updates_xml, "xml")
    elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetExtendedUpdateInfo"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/862adc30-a9be-4ef7-954c-13934d8c1c77
        data = BeautifulSoup(update_handler.get_extended_update_info_xml, "xml")
    elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/ReportEventBatch"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/da9f0561-1e57-4886-ad05-57696ec26a78
        data = BeautifulSoup(update_handler.report_event_batch_xml, "xml")
    elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/SimpleAuthWebService/GetAuthorizationCookie"':
        # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/44767c55-1e41-4589-aa01-b306e0134744
        data = BeautifulSoup(update_handler.get_authorization_cookie_xml, "xml")
    else:
        logging.warning("SOAP Action not handled")
        logging.info('SOAP Action: {}'.format(soap_action))
        return
    self._set_response()
    self.wfile.write(data.encode_contents())
    logging.info('SOAP Action: {}'.format(soap_action))
    if data is not None:
        logging.debug(
            "POST Response,\nPath: {path}\nHeaders:\n{headers}\n\nBody:\n{body}\n"
            .format(path=self.path, headers=self.headers,
                    body=data.encode_contents()))  # fixed: call encode_contents(), not the bound method
    else:
        logging.warning("POST Response without data.")
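The handler above relies on a _set_response helper that is not shown in the source. A minimal sketch of what such a helper might look like on a BaseHTTPRequestHandler subclass (hypothetical; the real helper may set different headers):

def _set_response(self):
    # send a 200 status and an XML content type before the SOAP body is written
    self.send_response(200)
    self.send_header('Content-Type', 'text/xml')
    self.end_headers()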
def process_raw_content(raw_content):
    """
    Processes a markdown-formatted string, returning a dict that can be used to
    populate an Article instance

    :param raw_content: markdown string
    :return: extracted article fields
    :rtype: dict
    """
    data = {}
    # Since we already have BeautifulSoup in the requirements, it makes sense to leverage it here.
    data['full_rendered_content'] = markdown(raw_content)
    soup = BeautifulSoup(data['full_rendered_content'], 'html.parser')  # explicit parser avoids a bs4 warning
    try:
        data['title'] = soup.find('h1').extract().encode_contents()
    except AttributeError:
        # Element not found
        data['title'] = ''
    # Markdown seems to add a paragraph and extra linebreaks inside blockquotes for some reason;
    # pre_v1 skipped any HTML here, so we'll do the same, and we'll remove the linebreaks too.
    try:
        data['punchline'] = soup.find('blockquote').extract().find('p').encode_contents().strip()
    except AttributeError:
        data['punchline'] = ''
    try:
        # Slightly more complex: we need to find the first H2, and extract the first P before it
        data['description'] = soup.find('h2').find_previous('p').extract().encode_contents()
    except AttributeError:
        data['description'] = ''
    data['rendered_html'] = soup.encode_contents()
    return data
def process(toroot, html):
    soup = BeautifulSoup(html, 'html.parser')
    try:
        subTitle = soup.find(class_='header').find(class_='subTitle')
        link = soup.new_tag('a', href='package-summary.html')
        link.string = subTitle.encode_contents(formatter='html')
        backIcon = soup.new_tag('i', **{'class': 'material-icons'})
        backIcon.string = 'arrow_back'
        link.insert(0, backIcon)
        subTitle.clear()
        subTitle.append(link)
    except:
        pass
    prettyprints = soup.find_all('pre', class_='prettyprint')
    for p in prettyprints:
        # strip trailing whitespace; note the flags must be passed via flags=,
        # since the fourth positional argument of re.sub is count, not flags
        p.string = re.sub(r'\s+$', '', p.string, flags=re.M | re.S | re.I)
    soup.head.append(soup.new_tag('link', rel='stylesheet',
                                  href='http://fonts.googleapis.com/css?family=Roboto:400,700,300|Roboto+Mono'))
    soup.head.append(soup.new_tag('link', rel='stylesheet',
                                  href='https://fonts.googleapis.com/icon?family=Material+Icons'))
    soup.head.append(soup.new_tag('link', rel='stylesheet', href=toroot + 'resources/prettify.css'))
    soup.head.append(soup.new_tag('link', rel='stylesheet', href=toroot + 'resources/javadoc_stylesheet.css'))
    soup.head.append(soup.new_tag('script', src=toroot + 'resources/prettify.js'))
    if soup.body:
        script = soup.new_tag('script')
        script.string = 'prettyPrint();'
        soup.body.append(script)
    return soup.encode_contents(formatter='html')
def get_item_info(url):
    web_data = requests.get(url, headers=header)
    soup = BeautifulSoup(web_data.text, 'lxml')
    soup.encode_contents(encoding='utf-8')
    no_longer_exists = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exists:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')[0].text
        print type(price)
        date = soup.select('.time')[0].text
        area = list(soup.select('c_25d a')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None
        # item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area})
        print {'title': title, 'price': price, 'date': date, 'area': area}
def clean_html(html, strip_unsafe=False):
    """
    clean a html string

    if strip_unsafe is set, potentially malicious tags (defined in
    ``settings.REMOVE_WITH_CONTENT``) are also removed

    :param html: the input HTML string that needs to be cleaned
    :type html: basestring
    :param strip_unsafe: whether to strip potentially malicious tags
    :type strip_unsafe: bool
    :return: cleaned html
    :rtype: basestring
    """
    if not html:
        return ""
    doc = BeautifulSoup(html, "html.parser")
    if strip_unsafe:
        for tag in doc.find_all(True):
            if tag.name not in getattr(settings, 'ACCEPTABLE_ELEMENTS', tuple()):
                logger.warning(
                    "Found tag {} which is not in the ACCEPTABLE_ELEMENTS setting".format(tag.name)
                )
                if tag.name in getattr(settings, 'REMOVE_WITH_CONTENT', tuple()):
                    tag.decompose()
                else:
                    tag.unwrap()
            try:
                # copy the keys into a list so attributes can be deleted while iterating
                for attr in list(tag.attrs.keys()):
                    # strip all attributes that are not in the acceptable attributes
                    if attr not in getattr(settings, 'ACCEPTABLE_ATTRIBUTES', tuple()):
                        logger.warning(
                            "Removing attribute {} of tag {} as it is not listed in the "
                            "ACCEPTABLE_ATTRIBUTES settings".format(attr, tag.name)
                        )
                        del tag[attr]
                        continue
                    # special cases for attributes style and href
                    if attr == 'style':
                        tag[attr] = clean_styles(tag[attr])
                    elif attr == 'href':
                        tag[attr] = clean_hrefs(tag[attr])
            except:
                pass
    # ToDo: Check if we need to be python2 compatible with that
    # doc = unicode(doc)
    # try:
    #     if HTMLField.EMPTY_HTML_REGEXP.match(doc.encode("UTF-8")):
    #         return u""
    # except:
    #     pass
    # encode the result with BeautifulSoup's html formatter, thus keeping entities
    # such as &nbsp; (instead of \xa0)
    return doc.encode_contents(formatter='html').decode()
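A usage sketch for clean_html, assuming Django-style settings such as ACCEPTABLE_ELEMENTS = ('p', 'a', 'br'), ACCEPTABLE_ATTRIBUTES = ('href',) and REMOVE_WITH_CONTENT = ('script',) (hypothetical values; the real project defines these in its settings module):

dirty = '<p onclick="x()">Hello <script>alert(1)</script><a href="/y">link</a></p>'
print(clean_html(dirty, strip_unsafe=True))
# with the assumed settings, the script tag and the onclick attribute are gone:
# <p>Hello <a href="/y">link</a></p>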
def fix_description(description):
    soup = BeautifulSoup(description, 'html.parser')
    match = soup.findAll('script')
    if match:
        for m in match:
            m.decompose()
    match2 = soup.findAll('o:p')
    if match2:
        for m in match2:
            m.decompose()
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    if soup.find('img', {'src': 'http://freeauctiondesigns.com/ebay/templates/green_white_swirls/top.gif'}):
        return soup.encode_contents(formatter='html').decode('utf-8')
    else:
        return BeautifulSoup(
            border.format(soup.encode_contents(formatter='html')),
            'html.parser'
        ).encode_contents(formatter='html').decode('utf-8')
def download_page(url, target):
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.find_all():
        if elem.get('src', None):
            elem['src'] = relative_to_absolute(url, elem['src'])
        if elem.get('href', None):
            elem['href'] = relative_to_absolute(url, elem['href'])
    with open(target, "w") as f:
        f.write(soup.encode_contents())
def parse_article(content):
    soup = BeautifulSoup(content, 'lxml')
    for tag in soup.find_all():
        if tag.name == 'a' and tag.attrs.get('href') and tag.text.strip():
            tag.attrs = {'href': tag.attrs['href']}
        else:
            tag.unwrap()
    return soup.encode_contents().decode('utf-8').strip()
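encode_contents() returns bytes, which is why snippets like the one above immediately call .decode(); bs4's decode_contents() yields the same markup as str directly. A minimal, self-contained illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>a &amp; b</p>', 'html.parser')
print(soup.p.encode_contents())  # b'a &amp; b' -- bytes
print(soup.p.decode_contents())  # 'a &amp; b'  -- str, no .decode() needed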
def scrub(file_name, flag):
    soup = BeautifulSoup(open(file_name), "html5lib")
    for node in soup.find_all(class_=flag):
        node.extract()
    new_html = soup.encode_contents(formatter="html")
    with open(file_name, "wb") as file:
        file.write(new_html)
    return
def _strip_tags(self, html, invalid_tags=['em', 'a', 'span', 'strong', 'div', 'p']):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all(True):
        if tag.name in invalid_tags:
            s = ""
            for c in tag.contents:
                if not isinstance(c, NavigableString):
                    c = self._strip_tags(unicode(c), invalid_tags)
                    s += unicode(c).strip()
                else:
                    s += unicode(c)
            tag.replace_with(s)
    return soup.encode_contents().decode('UTF-8')
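A usage sketch for _strip_tags (Python 2 semantics, matching the unicode() calls above; obj stands in for an instance of the enclosing class, which the source does not show):

# obj._strip_tags(u'<p>Hello <strong>world</strong>!</p>')
# returns u'Hello world!' -- every invalid tag is replaced by its text content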
def _scrapeHomeAndGetLinks(self, home):
    soup_home = BeautifulSoup(home, 'html.parser')
    soup_home.encode_contents(encoding='utf-8')
    # get the 1/3 column slices containing the major url links we want
    selectionColumns = [s for s in map(lambda t: t.encode('utf-8'),
                                       soup_home.body.find_all('div', {'class': 'one-third'}))]

    # grab the major category of link from each 1/3 column slice
    def _getHomeHyperLinks(col):
        greaterCat = BeautifulSoup(col, 'html.parser').find_all('span')
        catAnchor = []
        for cat in greaterCat:
            catAnchor.extend([anchor['href'] for anchor in cat.find_all('a')])
        return catAnchor

    list_of_urls = (_getHomeHyperLinks(col) for col in selectionColumns)
    # flatten each list of columns of url paths
    url_path = functools.reduce(lambda acc, lst: acc + lst, list_of_urls, [])
    return url_path
def replace_cid_in_html(html, mapped_attachments):
    if html is None:
        return None
    soup = BeautifulSoup(html)
    inline_images = soup.findAll('img', {'src': lambda src: src and src.startswith('cid:')})
    for image in inline_images:
        inline_attachment = mapped_attachments.get(image.get('src')[4:])
        if inline_attachment is not None:
            image['src'] = reverse('email_attachment_proxy_view', kwargs={'pk': inline_attachment.pk})
    return soup.encode_contents()
def replace_anchors_in_html(html):
    """ Make all anchors open outside the iframe """
    if html is None:
        return None
    soup = BeautifulSoup(html)
    for anchor in soup.findAll('a'):
        anchor.attrs.update({
            'target': '_blank',
        })
    return soup.encode_contents()
def convert_text_to_html(input_str):
    """
    Converts a textfield (without html) to an html field

    This is useful for migrations or management commands where you need to
    manually convert the content of a field

    :param input_str: plain-text content to convert
    :return: html representation of the input
    """
    # convert newlines to line breaks
    input_str = "<p>" + input_str + "</p>"
    soup = BeautifulSoup(input_str, "html.parser")
    input_str = soup.encode_contents(encoding="utf8").decode("utf8")
    input_str = input_str.replace(u"\n", u"<br/>")
    return clean_html(input_str, strip_unsafe=True)
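For example, assuming 'p' and 'br' survive the clean_html pass (i.e. they are among the acceptable elements in the project settings):

# convert_text_to_html(u"line one\nline two")
# -> u'<p>line one<br/>line two</p>'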
def process(content, used=False):
    if not content:
        return
    soup = BeautifulSoup(content.encode('utf8'), from_encoding='utf8')
    for tag in soup.find_all('a'):
        href = tag.attrs.get('href', '')
        if not href.startswith('http'):
            continue
        if 'img-fotki' in href or not used:
            tag.attrs['rel'] = 'nofollow'
    content = soup.encode_contents(indent_level=2).decode('utf8') \
        .replace('<html><body>', '').replace('</body></html>', '')
    return content
def edit_page(filename):
    original_page = open(filename, 'r').read()
    soup = BeautifulSoup(original_page, 'lxml')
    forms = soup.find_all('form')
    print "[*] Found forms:"
    i = 0
    for f in forms:
        print "FORM " + str(i) + " --> " + f.get('action', 'None')
        i += 1
    while True:
        try:
            i = int(raw_input('Form to log: '))
        except ValueError:
            print "Enter the form number"
        try:
            f = forms[i]
            break
        except IndexError:
            print "Invalid form number"
    print "Selected form " + str(i) + '\n'
    f['action'] = "/form"
    loggable = []
    for i in f.find_all('input'):
        if i.get('name'):
            loggable.append(i['name'])
    while True:
        print "[*] Form fields:"
        for i in range(len(loggable)):
            print str(i) + " - " + loggable[i]
        input_params = raw_input('Fields to log (comma separated, e.g 1,4,5): ').split(',')
        to_log = []
        try:
            for i in input_params:
                to_log.append(loggable[int(i)])
            break
        except:
            print "Invalid format: use form field identifiers (e.g 1,4,5)"
    print 'Logging: ' + str(to_log) + '\n'
    with open('index.html', "w") as f:
        f.write(soup.encode_contents())
    return to_log
def transform_links(self, url, content):
    parse_result = urlparse.urlparse(url)
    soup = BeautifulSoup(content)
    for link in soup.find_all(True):
        href = None
        for name in ['href', 'src']:
            if name in link.attrs:
                href = link.attrs[name]
                break
        if href is None:
            continue
        new_href = self.get_absolute_url(parse_result.path, link.attrs[name])
        if new_href is not None:
            link.attrs[name] = new_href
    return soup.encode_contents(formatter='html')
def parse_page(self):
    page = self.read_url(self.original_url)
    soup = BeautifulSoup(page, "lxml")
    css = soup.find_all('link', {'rel': 'stylesheet'})
    js = soup.find_all('script')
    images = soup.find_all('img')
    styles = soup.find_all('style')
    inline_styles = soup.find_all(attrs={
        'style': re.compile(
            "(?:background|background-image):(?:[ ]+|)(?:[\#\w\d]*|)(?:[ ]+|)url\((.*?)\)"
        )
    })
    formsDetection = FormsDetection(soup, self.url)
    formsDetection.replace()
    for i in images:
        if i.get('src'):
            i['src'] = self.parse_image(i['src'])
    for j in js:
        if j.get('src'):
            j['src'] = self.parse_javascript(j['src'])
            j['type'] = 'text/javascript'
    for _c in css:
        if _c.get('href'):
            _c['href'] = self.parse_css(_c['href'])
            _c['type'] = 'text/css'
    for s in styles:
        s.string = self.parse_css_text(s.string)
    for _is in inline_styles:
        _is.attrs['style'] = self.parse_css_text(_is.attrs['style'])
    return self.write_file(soup.encode_contents(), 'throwaway_dirname', 'html', 'w')
def metadata_for_papers(paper_ids, outfile):
    fout = open(outfile, "w")
    fout.write("paper_id{0}pubmed_id{0}author_ids\n".format(DELIM))
    for i, paper_id in enumerate(paper_ids):
        # Counter
        if i % 5 == 0:
            print i
        # Execute query, convert result to Soup format
        # try:
        res = requests.get(SCOPUS_QUERY.format(paper_id))
        soup = BeautifulSoup(res.content)
        # get author list
        authors_section = soup.find(id="authorlist").encode_contents()
        author_list = re.findall(r"\?authorId=(.*?)\&", authors_section)
        authors_str = DELIM.join(author_list)
        # get pubmed ID
        try:
            # assume there's only one
            pubmed_id = re.findall(r"\"View in PubMed\">(.*?)<", soup.encode_contents())[0]
        except IndexError:
            # no pubmed ID
            print i, ":no pubmed ID"
            pubmed_id = ""
        # write to file
        fout.write("{1}{0}{2}{0}{3}\n".format(DELIM, paper_id, pubmed_id, authors_str))
        # except:
        #     # Can't find ID or something went wrong
        #     fout.write("{0}\n".format(paper_id))
    fout.close()
def render_element(e_id, e_type):
    client = MongoClient('localhost', 27017)
    widget = client.asktask.q_builder_widgets.find_one({'id': e_type})
    code_js = ''
    code_block = BeautifulSoup('<div class="quest_element" id="{}"></div>'.format(e_id))
    code_block.div.append(code_block.new_tag('h3'))
    code_block.div.h3.append(code_block.new_tag('i', **{'class': 'fa fa-{}'.format(widget['icon'])}))
    code_block.div.h3.append(' ' + widget['title'])
    code_block.div.append(code_block.new_tag('div'))
    # code_block.div.div.append(code_block.new_tag('p'))
    # code_block.div.div.p.append(widget['description'])
    if 'form' in widget:
        form = code_block.new_tag('form', **{'class': 'widget_settings pure-form'})
        for field in widget['form']:
            if field['type'] == 'textarea':
                form.append(code_block.new_tag('textarea', **{'placeholder': field['name']}))
            elif field['type'] == 'select':
                sel_el = code_block.new_tag('select')
                for opt in field['options']:
                    o = code_block.new_tag('option')
                    o.append(opt)
                    sel_el.append(o)
                form.append(sel_el)
            elif field['type'] == 'spinner':
                form.append(code_block.new_tag('input', **{'class': 'spinner',
                                                           'id': '{}-{}'.format(e_id, field['id']),
                                                           'value': field['value']}))
                js_opt = []
                if 'min' in field.keys():
                    js_opt.append('min: {}'.format(field['min']))
                if 'max' in field.keys():
                    js_opt.append('max: {}'.format(field['max']))
                code_js += '$( "#{}-{}" ).spinner({{\n'.format(e_id, field['id'])
                code_js += ',\n'.join(js_opt)
                code_js += '});\n'
            else:
                field['id'] = '{}-{}'.format(e_id, field['id'])
                form.append(code_block.new_tag('input', **{k: v for k, v in field.items()}))
        code_block.div.div.append(form)
    return code_block.encode_contents(formatter=None), code_js
def parse_cms_template(html, cms_context, parent_namespace='', public=False,
                       request=dum_request, template_context=None, using=None):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param cms_context: Dictionary that is to be used to parse the cms attributes in template
    :type cms_context: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param public: Renders the page for public usage
    :type public: bool
    :param request: Request object to be used for template context
    :param template_context: Template context to be used for rendering the base and included templates
    :type template_context: dict
    :param using: Template engine used to render the final template
    :rtype: str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)
    for tag in soup.find_all(attrs={INCLUDE_TAG: include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop(INCLUDE_TAG)
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs[NAMESPACE_TAG]
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form '
                    '{namespace}:{template path}. '
                    'if namespace is not specified then another attribute '
                    'data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for {}'.format(
                            local_namespace, NAMESPACE_TAG
                        )
                    )
                else:
                    default_template_name = include_value
        if namespace:
            namespace += NAMESPACE_DELIMITER + local_namespace
        else:
            namespace = local_namespace
        template_name = cms_context.get(namespace, default_template_name)
        if template_name.endswith('.html'):
            template_name = template_name[:-5]
        try:
            include_template = validate_and_get_template(
                name=template_name, using=using
            )
        except ValidationError:
            include_template = validate_and_get_template(
                name=default_template_name, using=using
            )
        include_html = include_template.render(template_context, request)
        tag.attrs[NAMESPACE_TAG] = local_namespace
        if not public:
            tag.attrs[INCLUDE_TAG] = template_name
        replace_tag_content(tag=tag, content=include_html)
    for tag in soup.find_all(attrs={ATTR_TAG: attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag[ATTR_TAG].split('|')
        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key
            if key in cms_context:
                tag[attr_name] = render_template_string(
                    template_string=cms_context[key],
                    context=template_context,
                    request=request,
                    using=using
                )
    for tag in soup.find_all(attrs={CONTENT_TAG: content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag[CONTENT_TAG]
        md = False
        if key.startswith('md:'):
            key = key[3:]
            md = True
        key = _ns + NAMESPACE_DELIMITER + key if _ns else key
        if key in cms_context or REPLACE_TAG in tag.attrs:
            # REPLACE_TAG will be replaced with its content,
            # so it doesn't make much sense to process it in the else branch
            content = cms_context.get(key, '')
        else:
            content = tag.encode_contents()
            if not any(attr in content for attr in CMS_ATTRIBUTES):
                continue
        if any(attr in content for attr in CMS_ATTRIBUTES):
            content = parse_cms_template(
                html=content,
                cms_context=cms_context,
                parent_namespace=key,
                request=request,
                template_context=template_context,
                using=using
            )
        if md:
            content = markdown(content, escape=False)
        content = render_template_string(
            template_string=content,
            context=template_context,
            request=request,
            using=using
        )
        if public and REPLACE_TAG in tag.attrs:
            new_tag = BeautifulSoup(content, features=HTML_PARSER)
            tag.replace_with(new_tag)
        else:
            # We don't replace the tag in auth render so as to keep it editable
            replace_tag_content(tag=tag, content=content)
    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
def parse_cms_template(html, dictionary, parent_namespace='', publish=False, request=dum_request):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param dictionary: Dictionary that is to be used to parse the cms attributes in template
    :type dictionary: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param publish: This will hide sensitive info while rendering for public usage
    :type publish: bool
    :rtype: str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)
    for tag in soup.find_all(attrs={'data-cms-include': include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop('data-cms-include')
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs['data-cms-namespace']
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form {namespace}:{template path} '
                    'if namespace is not specified then another attribute data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for data-cms-namespace'.format(local_namespace)
                    )
                else:
                    default_template_name = include_value
        namespace += NAMESPACE_DELIMITER + local_namespace if namespace else local_namespace
        template_name = dictionary.get(namespace, default_template_name)
        if template_name.endswith('.html'):
            template_name = template_name[:-5]
        try:
            include_template = validate_and_get_template(template_name)
        except ValidationError:
            include_template = validate_and_get_template(
                default_template_name[:-5] if default_template_name.endswith('.html') else default_template_name
            )
        include_html = include_template.render(request=request)
        tag.attrs['data-cms-namespace'] = local_namespace
        if not publish:
            tag.attrs['data-cms-include'] = template_name
        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        new_tag.insert(0, BeautifulSoup(include_html, features=HTML_PARSER))
        tag.replaceWith(new_tag)
    # soup does not recognize the changes made in above loop unless I do this
    # Also do not move it inside the loop. It will mess up the variable scoping
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    for tag in soup.find_all(attrs={'data-cms-attr': attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag['data-cms-attr'].split('|')
        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key
            if key in dictionary:
                tag[attr_name] = dictionary[key]
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    for tag in soup.find_all(attrs={'data-cms-content': content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag['data-cms-content']
        md = False
        if key.startswith('md:'):
            key = key[3:]
            md = True
        key = _ns + NAMESPACE_DELIMITER + key if _ns else key
        if key in dictionary:
            content = dictionary[key]
        else:
            content = tag.encode_contents()
            if not any(_ in content for _ in CMS_ATTRIBUTES):
                continue
        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        if any(_ in content for _ in CMS_ATTRIBUTES):
            content = parse_cms_template(content, dictionary, parent_namespace=key, request=request)
        if md:
            content = markdown(content, False)
        new_tag.insert(0, BeautifulSoup(content, features=HTML_PARSER))
        tag.replace_with(new_tag)
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
def clean_source(self, file):
    soup = BeautifulSoup(file.read())
    for tag in self.clean_tags:
        for item in soup.find_all(tag):
            item.extract()
    return soup.encode_contents()
class ConfluencePageInflater(object):
    def __init__(self, page_source, page_handle, attach_handle, encoding='utf-8'):
        super(ConfluencePageInflater, self).__init__()
        self.soup = BeautifulSoup(page_source, 'html5lib', from_encoding=encoding)
        self.page_handle = page_handle
        self.attach_handle = attach_handle
        self.cleaned_up = False

    def filter_image(self):
        for img in self.soup.find_all('img'):
            ac_image = self.soup.new_tag('ac:image')
            src = img.get('src')
            if src and '//' not in src:
                attach = self.attach_handle(src, img.get('title'))
                if attach:
                    ri_resource = self.soup.new_tag('ri:attachment')
                    ri_resource['ri:filename'] = attach['resource_name']
                else:
                    img.decompose()
                    continue
            else:
                ri_resource = self.soup.new_tag('ri:url')
                ri_resource['ri:value'] = src
            ac_image.append(ri_resource)
            if img.has_attr('alt'):
                ac_image['ac:alt'] = img['alt']
            img.replace_with(ac_image)

    def filter_link(self):
        for link in self.soup.find_all('a'):
            href = link.get('href')
            if href and '//' not in href:
                if '?' in href:
                    href = href[:href.index('?')]
                ac_link = self.soup.new_tag('ac:link')
                if '#' in href:
                    ac_link['ac:anchor'] = href[href.index('#') + 1:]
                    href = href[:href.index('#')]
                if href.endswith('.html'):
                    page = self.page_handle(href)
                    if page:
                        ri_resource = self.soup.new_tag('ri:page')
                        ri_resource['ri:content-title'] = page['title']
                    else:
                        link.decompose()
                        continue
                else:
                    attach = self.attach_handle(href, link.get('title'))
                    if attach:
                        ri_resource = self.soup.new_tag('ri:attachment')
                        ri_resource['ri:filename'] = attach['resource_name']
                    else:
                        link.decompose()
                        continue
                ac_link.append(ri_resource)
                children = link.find_all()
                if children:
                    body = self.soup.new_tag('ac:link-body')
                    for child in children:
                        body.append(child)
                elif link.text:
                    body = self.soup.new_tag('ac:plain-text-link-body')
                    body.append(self.soup.new_string(link.text, CData))
                else:
                    link.decompose()
                    continue
                if link.has_attr('title'):
                    ac_link['ac:title'] = link['title']
                ac_link.append(body)
                link.replaceWith(ac_link)

    @property
    def title(self):
        title = self.soup.find('title')
        return title and title.encode_contents().strip() or ''

    def filter_dl(self):
        for dl in self.soup.find_all('dl'):
            ul = self.soup.new_tag('ul')
            dts = dl.find_all('dt')
            dds = dl.find_all('dd')
            for dt, dd in zip(dts, dds):
                li = self.soup.new_tag('li')
                dt.name = 'p'
                li.append(dt)
                dd.name = 'p'
                li.append(dd)
                ul.append(li)
            dl.replace_with(ul)

    @property
    def is_home_page(self):
        meta = self.soup.find('meta', attrs={'name': 'homepage'})
        return meta is not None and meta.get('value') == 'true'

    def filter_code(self):
        for pre in self.soup.find_all('pre'):
            code_block = self.soup.new_tag('ac:structured-macro')
            code_block['ac:name'] = 'code'
            if pre.has_attr('data-lang'):
                lang_param = self.soup.new_tag('ac:parameter')
                lang_param['ac:name'] = 'language'
                lang_param.append(pre['data-lang'])
                code_block.append(lang_param)
            plain_text = self.soup.new_tag('ac:plain-text-body')
            plain_text.append(self.soup.new_string(pre.get_text(), CData))
            code_block.append(plain_text)
            pre.replace_with(code_block)

    @property
    def cleaned_src(self):
        if not self.cleaned_up:
            self.cleaned_up = True
            self.filter_image()
            self.filter_link()
            self.filter_dl()
            self.filter_code()
        body = self.soup.find('body')
        return (body and body.encode_contents(formatter='html')
                or self.soup.encode_contents(formatter='html'))
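A minimal usage sketch for ConfluencePageInflater (the two handles below are hypothetical stand-ins; real ones would resolve exported hrefs to Confluence pages and uploaded attachments):

def page_handle(href):
    # map an exported .html href to an existing Confluence page
    return {'title': 'Target Page'}

def attach_handle(src, title=None):
    # map a local resource path to an uploaded attachment
    return {'resource_name': 'diagram.png'}

with open('page.html') as fh:
    inflater = ConfluencePageInflater(fh.read(), page_handle, attach_handle)
storage_xml = inflater.cleaned_src  # Confluence storage-format markup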
def parse_cms_template(html, cms_context, parent_namespace='', public=False,
                       request=dum_request, template_context=None):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param cms_context: Dictionary that is to be used to parse the cms attributes in template
    :type cms_context: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param public: Renders the page for public usage
    :type public: bool
    :param request: Request object to be used for template context
    :param template_context: Template context to be used for rendering the base and included templates
    :type template_context: dict
    :rtype: str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)
    for tag in soup.find_all(attrs={INCLUDE_TAG: include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop(INCLUDE_TAG)
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs[NAMESPACE_TAG]
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form '
                    '{namespace}:{template path}. '
                    'if namespace is not specified then another attribute '
                    'data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for {}'.format(
                            local_namespace, NAMESPACE_TAG
                        )
                    )
                else:
                    default_template_name = include_value
        if namespace:
            namespace += NAMESPACE_DELIMITER + local_namespace
        else:
            namespace = local_namespace
        template_name = cms_context.get(namespace, default_template_name)
        if template_name.endswith('.html'):
            template_name = template_name[:-5]
        try:
            include_template = validate_and_get_template(template_name)
        except ValidationError:
            include_template = validate_and_get_template(default_template_name)
        include_html = include_template.render(template_context, request)
        tag.attrs[NAMESPACE_TAG] = local_namespace
        if not public:
            tag.attrs[INCLUDE_TAG] = template_name
        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        new_tag.insert(0, BeautifulSoup(include_html, features=HTML_PARSER))
        tag.replaceWith(new_tag)
    # soup does not recognize the changes made in above loop unless I do this
    # Also do not move it inside the loop. It will mess up the variable scoping
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    for tag in soup.find_all(attrs={ATTR_TAG: attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag[ATTR_TAG].split('|')
        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key
            if key in cms_context:
                tag[attr_name] = cms_context[key]
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    for tag in soup.find_all(attrs={CONTENT_TAG: content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag[CONTENT_TAG]
        md = False
        if key.startswith('md:'):
            key = key[3:]
            md = True
        key = _ns + NAMESPACE_DELIMITER + key if _ns else key
        if key in cms_context or REPLACE_TAG in tag.attrs:
            # REPLACE_TAG will be replaced with its content,
            # so it doesn't make much sense to process it in the else branch
            content = cms_context.get(key, '')
        else:
            content = tag.encode_contents()
            if not any(_ in content for _ in CMS_ATTRIBUTES):
                continue
        if any(_ in content for _ in CMS_ATTRIBUTES):
            content = parse_cms_template(
                html=content,
                cms_context=cms_context,
                parent_namespace=key,
                request=request,
                template_context=template_context
            )
        if md:
            content = markdown(content, False)
        if public and REPLACE_TAG in tag.attrs:
            new_tag = BeautifulSoup(content, features=HTML_PARSER)
        else:
            # We don't replace the tag in auth render so as to keep it editable
            new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
            new_tag.insert(0, BeautifulSoup(content, features=HTML_PARSER))
        tag.replace_with(new_tag)
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples

    :param soup: BeautifulSoup object
    :return: tuple of the form (edition, headword, head_lang, translation,
             trans_lang, trans_lang_code, part_of_speech)
    """
    # START non-edition-specific
    # this is the table of content which is present in each edition
    toc = soup.find('div', id='mw-content-text')
    page_state = {'headword': None,
                  'headword_lang': None,
                  'part_of_speech': ''}
    pronounce = ''
    page_state['headword'] = soup.find('h1', id='firstHeading', class_='firstHeading').text
    for element in toc.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            level = self.get_heading_level(element.name)
            # END non-edition-specific
            # Find the headword language
            if 'style' in element.attrs and element['style'] == 'background:#EEEEFF':
                if element.a is not None:
                    page_state['headword_lang'] = element.a.text.replace('dili', '').strip()
                    pronounce = ''
            elif element.a is not None and 'title' in element.a.attrs and \
                    'Kateqoriya:Nitq hissələri' in element.a['title']:
                page_state['part_of_speech'] = element.a.text
            elif element.name == 'ul':
                for li in element.find_all('li'):
                    if not isinstance(li, Tag):
                        continue
                    if li.get_text().split(':')[0] == 'Tələffüz':
                        pronounce = li.get_text().split(':')[1].strip()
            elif element.span is not None:
                formatted = BeautifulSoup(element.span.text, 'html.parser')
                formatted = formatted.encode_contents(formatter='html')
                if b'T\xc9\x99rcüm\xc9\x99l\xc9\x99r :' in formatted:
                    for translation, lang, lang_code in self.parse_translation_table(
                            element.find_next_sibling('div', class_='NavFrame')):
                        if translation == '':
                            continue
                        lang = lang.strip()
                        yield (self.edition,
                               page_state['headword'],
                               page_state['headword_lang'],
                               translation,
                               lang,
                               lang_code,
                               page_state['part_of_speech'],
                               pronounce)
def _replace_hrefs(self, in_body):
    """
    The function which replaces hrefs in each body segment.

    :param in_body: The human-readable form of the message segment.
    :return: The message segment with all links replaced.
    """
    found_valid_links = False
    original_a_tags = list()  # Stores discovered anchor tags
    original_hrefs = list()  # Stores discovered links
    message_html_soup = BeautifulSoup(in_body)
    # Find all 'a' and 'area' tags containing the 'href' property
    for discovered_href in message_html_soup.findAll(['a', 'area'], href=True):
        url = discovered_href['href'].strip()
        if (len(url) == 0 or
                url.startswith("mailto:") or
                url.startswith("tel:") or
                url.startswith("#") or
                # Allow Google calendar response links ('Yes', 'No', 'Maybe') to not be replaced.
                # On Android, this will cause a better experience because the Calendar handler
                # ties directly into Gmail app.
                # TODO Make a regex to work with international Google TLDs
                url.startswith("https://www.google.com/calendar/event?action=RESPOND")):
            continue
        # Check this link to see if it's a MailBeaker link. If it is, we unwrap it.
        discovered_href['href'] = self._unwrap_mailbeaker_link(url)
        found_valid_links = True
        original_a_tags.append(discovered_href)
        original_hrefs.append(discovered_href['href'])
    try:
        replacement_link_ids, replacement_links = self.beaker_client.generate_replacement_links(
            original_hrefs,
            self.message_id,
            self.domain_id,
            # The following is used to not save a beta user's links, by request.
            email=self.rcpt_to_address)
    except Exception as e:
        # Something failed while attempting to retrieve a new link URL, just go on.
        logging.exception("Link generation failed.",
                          extra={"email": self.rcpt_to_address,
                                 "service_message_id": self.message_id,
                                 "links": original_hrefs})  # TODO temp, remove from logging in the near future
        return in_body, found_valid_links
    for i, anchor_tag in enumerate(original_a_tags):
        try:
            a_title = "Protected by MailBeaker. Original link destination: " + \
                      self._html_encode(anchor_tag['href'])
            anchor_tag['title'] = a_title
            anchor_tag['href'] = replacement_links[i].replace("\"", "")
        except Exception as e:
            # Something failed while attempting to replace the link. Warn and proceed.
            logging.exception("Link replacement failed.",
                              extra={"email": self.rcpt_to_address,
                                     "service_message_id": self.message_id})
    # Dump the Soup back out to a string in the proper encoding
    in_body = message_html_soup.encode_contents(encoding='utf-8')
    # Update the message object's lists for links and link IDs.
    self.link_urls.extend(original_hrefs)
    self.link_ids.extend(replacement_link_ids)
    return in_body, found_valid_links
class WebResource(object):
    def _is_absolute(self, url):
        if not url:
            return False
        return bool(urlparse(url).scheme)

    @valid_mime
    def _is_stylesheet(self):
        if self._mime_minor() == 'css':
            return True
        return False

    @valid_mime
    def _is_image(self):
        if self._mime_major() == 'image':
            return True
        return False

    @valid_mime
    def _is_generic_mime(self):
        if self.mime == 'text/plain':
            return True
        return False

    @valid_mime
    def _mime_major(self):
        try:
            return self.mime.split('/')[0]
        except:
            return None

    @valid_mime
    def _mime_minor(self):
        try:
            return self.mime.split('/')[1]
        except:
            return None

    def _recursive_cache_resource(self, url):
        """Returns: filename => the filename of the cached resource"""
        if url is None:
            return None
        r = WebResource(url, self.base_storage, self.readable, self.user_agent, self.log)
        r.serialize()
        return r.filename

    def getMime(self):
        resource_mimetype = self.response.info()['Content-Type']
        # Taking care of content type with encoding,
        # ex. 'text/html; charset=UTF-8'
        resource_mimetype = resource_mimetype.split(';')[0]
        return resource_mimetype

    def getFilenameAndExtension(self):
        resource_extension = None
        if self.mime:
            resource_extension = mimetypes.guess_extension(self.mime)
        if not resource_extension:
            guessed_mime = mimetypes.guess_type(self.url)[0]
            if guessed_mime:
                resource_extension = mimetypes.guess_extension(guessed_mime)
        # Parsing url and extracting filename and file_ext.
        # TODOs:
        # Look if the following os module functions work on windows also,
        # as windows path separator is '\'.
        url_parsed = urlparse(self.url)
        filename, file_ext = splitext(basename(url_parsed.path))
        if not resource_extension:
            resource_extension = file_ext
        if not resource_extension:
            self.log.info('Extension could not be guessed for url: %s' % self.url)
            resource_extension = '.none'
        return filename, resource_extension

    def parseHtml(self):
        """Parses the html structure using beautiful soup."""
        try:
            self.soup = BeautifulSoup(self.content, "html5lib")
        except:
            log.exception('Failed to parse: %s' % self.url)
            self.soup = None

    def render_updated_html(self):
        return _to_unicode(self.soup.encode_contents())

    def contents_as_unicode(self):
        return _to_unicode(self.content)

    def cache_style_content(self, content, inline=False):
        """Caches all required URI's and Imports.

        Returns:
            - updated css content
        """
        if inline:
            sheet = cssutils.parseStyle(content)
        else:
            sheet = cssutils.parseString(content, href=self.url)
        if not inline:
            for rule in sheet.cssRules:
                if rule.type == rule.IMPORT_RULE:
                    f = self._recursive_cache_resource(rule.styleSheet.href)
                    rule.href = f

        def replacer(url):
            if url.startswith('data'):
                return url
            # TODOs:
            # Check for absolute url before joining
            return self._recursive_cache_resource(urljoin(self.url, url))

        cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
        return sheet.cssText

    def serialize(self):
        if self._is_stylesheet():
            self.content = self.cache_style_content(self.content)
        if self._is_image() or self._is_generic_mime():
            f = open(self.base_storage + self.filename, "wb")
            f.write(self.content)
            f.close()
        else:
            f = codecs.open(self.base_storage + self.filename, "w", "utf-8-sig")
            f.write(_to_unicode(self.content))
            f.close()

    @parsed
    def serializeUpdated(self):
        f = codecs.open(self.base_storage + self.filename, "w", "utf-8-sig")
        f.write(_to_unicode(self.soup.encode_contents()))
        f.close()

    @parsed
    def update_node_references(self):
        # Getting the source (src) attribute corrected
        node_list = self.soup.find_all(src=re.compile(''))
        for node in node_list:
            link_attr = node.get('src')
            if not self._is_absolute(link_attr):
                node.attrs['src'] = urljoin(self.url, link_attr)
        # Getting the hyper-reference (href) attribute corrected
        node_list = self.soup.find_all(href=re.compile(''))
        for node in node_list:
            link_attr = node.get('href')
            if not self._is_absolute(link_attr):
                node.attrs['href'] = urljoin(self.url, link_attr)
        self.updated_references = True

    @parsed
    @updated_references
    def cache_resources(self):
        # Updating the link tag
        for link in self.soup.find_all('link', rel=re.compile('stylesheet|icon')):
            f = self._recursive_cache_resource(link.get('href'))
            if f is not None:
                link.attrs['href'] = f
        # Updating the src tag
        for tag in self.soup.find_all(src=re.compile('')):
            f = self._recursive_cache_resource(tag.get('src'))
            if f is not None:
                tag.attrs['src'] = f
        # Looking over the style attribute
        for tag in self.soup.find_all(style=re.compile('')):
            css = self.cache_style_content(tag.get('style'), inline=True)
            if css is not None:
                tag.attrs['style'] = css
        # The <style> tag
        for link in self.soup.find_all('style'):
            css = self.cache_style_content(link.text)
            if css is not None:
                link.string = css

    def cache(self):
        self.cache_resources()
        self.filename = 'index.html'
        self.serializeUpdated()

    def __init__(self, url, base_storage='cache/', readable=False,
                 user_agent='Mozilla/5.0', log='websnip.log'):
        super(WebResource, self).__init__()
        self.url = url
        self.readable = readable
        self.base_storage = base_storage
        self.user_agent = user_agent
        self.log = Log(log)
        # TODOs:
        # Handle different types of URL opening errors, e.g. retry on timeout.
        try:
            self.url_opener = urllib2.build_opener()
            self.url_opener.addheaders = [('User-agent', self.user_agent)]
            self.response = self.url_opener.open(self.url)
            self.content = self.response.read()
            self.mime = self.getMime()
            h = hashlib.md5()
            h.update(self.content)
            self.hash = h.hexdigest()
        except:
            self.response = None
            self.content = None
            self.mime = None
            self.hash = None
        self.filebase, self.extension = self.getFilenameAndExtension()
        if self.hash:
            if self.readable:
                self.filebase = self.filebase + '-' + self.hash[:8]  # First 8 characters of md5 hash
            else:
                self.filebase = self.hash
        self.filename = self.filebase + self.extension
        self.soup = None
        self.updated_references = False

    @deprecated
    @parsed
    def updateNodeReferences(self, node, ref):
        for link in self.soup.find_all(node):
            link_attr = link.get(ref)
            if not self._is_absolute(link_attr):
                link.attrs[ref] = urljoin(self.url, link_attr)

    @deprecated
    @parsed
    def updateReferences(self):
        self.updateNodeReferences('a', 'href')
        self.updateNodeReferences('a', 'src')
        self.updateNodeReferences('link', 'href')
        self.updateNodeReferences('img', 'src')
        self.updateNodeReferences('script', 'src')
        self.updated_references = True

    @deprecated
    @parsed
    @updated_references
    def cacheNodeReferences(self, node, ref):
        for link in self.soup.find_all(node):
            link_attr = link.get(ref)
            if link_attr:
                r = WebResource(link_attr, self.base_storage, self.user_agent, self.log)
                r.serialize()
                link.attrs[ref] = r.filename

    @deprecated
    @parsed
    @updated_references
    def cacheReferencedResources(self):
        self.cacheNodeReferences('link', 'href')
        self.cacheNodeReferences('img', 'src')
        self.cacheNodeReferences('script', 'src')
def changetext(num):
    for i in range(0, num):
        try:
            driver.get(tkPage)
            if i >= 1:
                driver.find_element_by_tag_name('body').click()
                driver.find_element_by_tag_name('body').send_keys(Keys.SPACE)  # u'\ue00d'
                time.sleep(5)
            step2 = wait.until(EC.element_to_be_clickable(
                (By.ID, 'w0-data-table-grid-row[' + str(i) + ']-w0')))
            step2.click()
            url = driver.current_url
            # print url
            print 'ID OF PRODUCT:', re.findall(r'\d{12}$', url)
            step3 = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'HTML')))
            step3.click()
            # The HTML editor iframe id varies per listing; try each known id in
            # turn. This replaces the original deeply nested try/except chain
            # and behaves the same: the first frame that exists is used, and if
            # none is found a NoSuchElementException reaches the outer handler.
            frame_ids = ['v4-22txtEdit_ht', 'v4-20txtEdit_ht', 'v4-46txtEdit_ht',
                         'v4-43txtEdit_ht', 'v4-26txtEdit_ht', 'v4-32txtEdit_ht',
                         'v4-29txtEdit_ht', 'v4-23txtEdit_ht', 'v4-28txtEdit_ht',
                         'v4-47txtEdit_ht', 'v4-16txtEdit_ht', 'v4-25txtEdit_ht',
                         'v4-5txtEdit_ht', 'v4-35txtEdit_ht', 'v4-19txtEdit_ht',
                         'v4-38txtEdit_ht', 'v4-13txtEdit_ht', 'v4-17txtEdit_ht',
                         'v4-34txtEdit_ht']
            for frame_id in frame_ids:
                try:
                    driver.find_element_by_id(frame_id)
                    driver.switch_to.frame(frame_id)
                    print 'frame:' + frame_id
                    break
                except NoSuchElementException:
                    continue
            else:
                raise NoSuchElementException('no known HTML editor frame found')
            content1 = driver.find_element_by_tag_name('body').text
            # Added for store, because it used <a> as anchor, but that is not
            # supported in html5. 2017.9.27
            content = re.sub(r'<(a|\/a).*?>', "", content1)
            soup = BeautifulSoup(content, "lxml")
            links = soup.find_all('a')
            for link in links:
                # print type(link)
                link['target'] = '_blank'
            print 'link target is all _blank'
            try:
                # [link.extract() for link in soup.find_all('a', href=re.compile('xxxxx'))]
                [x.parent.extract() for x in soup.findAll(
                    'img', {'src': 'http://www.xxxxxs.com/fpdb/images/Logo_Ny62.jpg'})]
                [x.parent.extract() for x in soup.findAll(
                    'img', {'src': 'http://www.xxxxxs.com/images/Risk_Free_logo.gif'})]
                [x.parent.extract() for x in soup.findAll(
                    'img', {'src': 'http://www.xxxxxs.com/images/contact_email.jpg'})]
            except:
                print 'Category is wrong'
            # modifiedtxt1 = str(soup)
            modifiedtxt = soup.encode_contents(formatter='html')
            #############################################################
            copy(modifiedtxt)
            try:
                driver.find_element_by_tag_name('body').clear()
                driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL, 'v')
            except Exception as e1:
                print 'bad:', e1
            #############################################################
            # update and save
            driver.switch_to.default_content()
            step4 = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div#actionbar > input')))
            step4.click()
            # print 'step4 ok'
            time.sleep(10)
            step5 = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div#confirm_button_wrap > input')))
            step5.click()
            print i, ' is Ok'
            with open('picturefolder.txt', 'a') as f:
                f.writelines('Ok')
            ############################################################
        except Exception as e3:
            print 'error in first, i is', i
            print e3
            time.sleep(3)
            continue