Example No. 1
    def do_POST(self):

        content_length = int(self.headers['Content-Length'])
        post_data = self.rfile.read(content_length)

        post_data_xml = BeautifulSoup(post_data, "xml")
        data = None

        logging.debug(
            "POST Request,\nPath: {path}\nHeaders:\n{headers}\n\nBody:\n{body}\n"
            .format(path=self.path,
                    headers=self.headers,
                    body=post_data_xml.encode_contents()))

        soap_action = self.headers['SOAPAction']

        if soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetConfig"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/b76899b4-ad55-427d-a748-2ecf0829412b
            data = BeautifulSoup(update_handler.get_config_xml, 'xml')

        elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetCookie"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/36a5d99a-a3ca-439d-bcc5-7325ff6b91e2
            data = BeautifulSoup(update_handler.get_cookie_xml, "xml")

        elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/SyncUpdates"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/6b654980-ae63-4b0d-9fae-2abb516af894
            data = BeautifulSoup(update_handler.sync_updates_xml, "xml")

        elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/ClientWebService/GetExtendedUpdateInfo"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/862adc30-a9be-4ef7-954c-13934d8c1c77
            data = BeautifulSoup(update_handler.get_extended_update_info_xml,
                                 "xml")

        elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/ReportEventBatch"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/da9f0561-1e57-4886-ad05-57696ec26a78
            data = BeautifulSoup(update_handler.report_event_batch_xml, "xml")

        elif soap_action == '"http://www.microsoft.com/SoftwareDistribution/Server/SimpleAuthWebService/GetAuthorizationCookie"':
            # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-wusp/44767c55-1e41-4589-aa01-b306e0134744
            data = BeautifulSoup(update_handler.get_authorization_cookie_xml,
                                 "xml")

        else:
            logging.warning("SOAP Action not handled")
            logging.info('SOAP Action: {}'.format(soap_action))
            return

        self._set_response()
        self.wfile.write(data.encode_contents())
        logging.info('SOAP Action: {}'.format(soap_action))

        if data is not None:
            logging.debug(
                "POST Response,\nPath: {path}\nHeaders:\n{headers}\n\nBody:\n{body}\n"
                .format(path=self.path,
                        headers=self.headers,
                        body=data.encode_contents()))
        else:
            logging.warning("POST Response without data.")
Example No. 2
    def process_raw_content(raw_content):
        """
        Processes a markdown-formatted string, returning a dict that can be used to populate an Article instance

        :param raw_content: markdown string
        :return: :rtype: dict
        """
        data = {}
        # Since we already have BeautifulSoup in the requirements, it makes sense to leverage it here.
        data['full_rendered_content'] = markdown(raw_content)
        soup = BeautifulSoup(data['full_rendered_content'])
        try:
            data['title'] = soup.find('h1').extract().encode_contents()
        except AttributeError:  # Element not found
            data['title'] = ''
        # Markdown seems to add a paragraph and extra linebreaks inside blockquotes for some reason; in pre_v1 any HTML
        # was skipped, so we'll do the same here, and we'll remove the linebreaks too.
        try:
            data['punchline'] = soup.find('blockquote').extract().find(
                'p').encode_contents().strip()
        except AttributeError:
            data['punchline'] = ''
        try:
            # Slightly more complex: we need to find the first H2, and extract the first P before it
            data['description'] = soup.find('h2').find_previous(
                'p').extract().encode_contents()
        except AttributeError:
            data['description'] = ''
        data['rendered_html'] = soup.encode_contents()
        return data
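Note that `encode_contents()` returns a byte string, so the `title`, `punchline` and `description` values built above are bytes rather than text. A minimal standalone sketch (plain BeautifulSoup 4, nothing from the example assumed) contrasting it with its text counterpart `decode_contents()`:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<h1>Title &amp; subtitle</h1>", "html.parser")
h1 = soup.find("h1")

raw = h1.encode_contents()    # b'Title &amp; subtitle' -- bytes
text = h1.decode_contents()   # 'Title &amp; subtitle'  -- str

assert isinstance(raw, bytes) and isinstance(text, str)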
Example No. 3
def process(toroot, html):
  soup = BeautifulSoup(html, 'html.parser')
  try:
    subTitle = soup.find(class_='header').find(class_='subTitle')
    link = soup.new_tag('a', href='package-summary.html')
    link.string = subTitle.encode_contents(formatter='html')
    backIcon = soup.new_tag('i', **{'class':'material-icons'})
    backIcon.string = 'arrow_back'
    link.insert(0, backIcon)
    subTitle.clear()
    subTitle.append(link)
  except:
    pass

  prettyprints = soup.find_all('pre', class_='prettyprint')
  for p in prettyprints:
    p.string = re.sub(r'\s+$', '', p.string, flags=re.M | re.S | re.I)
  soup.head.append(soup.new_tag('link', rel='stylesheet', href='http://fonts.googleapis.com/css?family=Roboto:400,700,300|Roboto+Mono'))
  soup.head.append(soup.new_tag('link', rel='stylesheet', href='https://fonts.googleapis.com/icon?family=Material+Icons'))
  soup.head.append(soup.new_tag('link', rel='stylesheet', href=toroot + 'resources/prettify.css'))
  soup.head.append(soup.new_tag('link', rel='stylesheet', href=toroot + 'resources/javadoc_stylesheet.css'))
  soup.head.append(soup.new_tag('script', src=toroot + 'resources/prettify.js'))
  if soup.body:
    script = soup.new_tag('script')
    script.string = 'prettyPrint();'
    soup.body.append(script)
  return soup.encode_contents(formatter='html')
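The `formatter='html'` argument used above controls output escaping: the default 'minimal' formatter only escapes &, < and >, whereas 'html' writes named entities such as &nbsp; back out instead of the raw character. A small standalone sketch (BeautifulSoup 4 only) showing the difference:

from bs4 import BeautifulSoup

# html.parser turns &nbsp; into the non-breaking space character '\xa0'
soup = BeautifulSoup("<p>one&nbsp;two</p>", "html.parser")
p = soup.find("p")

print(p.encode_contents())                  # b'one\xc2\xa0two' (minimal formatter, UTF-8 bytes)
print(p.encode_contents(formatter="html"))  # b'one&nbsp;two'   (named entity restored)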
Example No. 6
def convert_text_to_html(input_str):
    # convert newlines to line breaks
    input_str = "<p>" + input_str + "</p>"

    soup = BeautifulSoup(input_str, "html.parser")
    input_str = soup.encode_contents(encoding="utf8").decode("utf8")
    input_str = input_str.replace("\n", "<br/>")

    return clean_html(input_str, strip_unsafe=True)
Example No. 7
def get_item_info(url):
    web_data = requests.get(url, headers=header)
    soup = BeautifulSoup(web_data.text, 'lxml')
    soup.encode_contents(encoding='utf-8')
    no_longer_exists = '404' in soup.find(
        'script', type='text/javascript').get('src').split('/')
    if no_longer_exists:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')[0].text
        print type(price)
        date = soup.select('.time')[0].text
        area = list(
            soup.select('span.c_25d a')[0].stripped_strings) if soup.find_all(
                'span', 'c_25d') else None
        # item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area})
        print {'title': title, 'price': price, 'date': date, 'area': area}
Example No. 8
def clean_html(html, strip_unsafe=False):
    """
    Clean an HTML string.
    If strip_unsafe is set, potentially malicious tags (defined in ``settings.REMOVE_WITH_CONTENT``) are also removed.

    :param html: the input HTML string that needs to be cleaned
    :type html: basestring
    :param strip_unsafe:
    :type strip_unsafe: bool
    :return: cleaned html
    :rtype: basestring
    """
    if not html:
        return ""

    doc = BeautifulSoup(html, "html.parser")

    if strip_unsafe:
        for tag in doc.find_all(True):
            if tag.name not in getattr(settings, 'ACCEPTABLE_ELEMENTS', tuple()):
                logger.warning(
                    "Found tag {} which is not in the ACCEPTABLE_ELEMENTS setting".format(tag.name)
                )
                if tag.name in getattr(settings, 'REMOVE_WITH_CONTENT', tuple()):
                    tag.decompose()
                else:
                    tag.unwrap()

            try:
                for attr in tag.attrs.keys():
                    # strip all tags that are not in acceptable attributes
                    if attr not in getattr(settings, 'ACCEPTABLE_ATTRIBUTES', tuple()):
                        logger.warning(
                            "Removing attribute {} of tag {} as it is not listed in the "
                            "ACCEPTABLE_ATTRIBUTES settings".format(attr, tag.name)
                        )
                        del tag[attr]
                        continue

                    # special cases for attributes style and href
                    if attr == 'style':
                        tag[attr] = clean_styles(tag[attr])
                    elif attr == 'href':
                        tag[attr] = clean_hrefs(tag[attr])

            except:
                pass

    # ToDo: Check if we need to be python2 compatible with that
    # doc = unicode(doc)
    # try:
    #     if HTMLField.EMPTY_HTML_REGEXP.match(doc.encode("UTF-8")):
    #         return u""
    # except:
    #     pass
    # encode the result with beautifulsoups html converter, thus "keeping" &nbsp; as &nbsp; (instead of \xa0)
    return doc.encode_contents(formatter='html').decode()
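clean_html relies on the difference between `decompose()` (drop a tag together with its content, used for tags listed in REMOVE_WITH_CONTENT) and `unwrap()` (drop only the tag, keeping its children). A minimal standalone sketch of that distinction, independent of the Django settings the function assumes:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><script>alert(1)</script><b>bold text</b></div>", "html.parser")

soup.find("script").decompose()  # removes the tag and everything inside it
soup.find("b").unwrap()          # removes the tag but keeps its children in place

print(soup.decode())             # <div>bold text</div>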
Example No. 9
def fix_description(description):
	soup = BeautifulSoup(description, 'html.parser')
	match = soup.findAll('script')
	if match:
		for m in match:
			m.decompose()

	match2 = soup.findAll('o:p')
	if match2:
		for m in match2:
			m.decompose()

	comments = soup.findAll(text=lambda text: isinstance(text, Comment))
	[comment.extract() for comment in comments]

	if soup.find('img', {'src': 'http://freeauctiondesigns.com/ebay/templates/green_white_swirls/top.gif'}):
		return soup.encode_contents(formatter='html').decode('utf-8')
	else:
		return BeautifulSoup(border.format(soup.encode_contents(formatter='html')), 'html.parser').encode_contents(formatter='html').decode('utf-8')
Example No. 10
def download_page(url, target):
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.find_all():
        if elem.get('src', None):
            elem['src'] = relative_to_absolute(url, elem['src'])
        if elem.get('href', None):
            elem['href'] = relative_to_absolute(url, elem['href'])
    with open(target, "w") as f:
        f.write(soup.encode_contents())
Example No. 11
    def parse_article(content):
        soup = BeautifulSoup(content, 'lxml')

        for tag in soup.find_all():
            if tag.name == 'a' and tag.attrs.get('href') and tag.text.strip():
                tag.attrs = {'href': tag.attrs['href']}
            else:
                tag.unwrap()

        return soup.encode_contents().decode('utf-8').strip()
Example No. 12
def scrub(file_name, flag):
	soup = BeautifulSoup(open(file_name), "html5lib")
	for node in soup.find_all(class_=flag):
		node.extract()

	new_html = soup.encode_contents(formatter="html")
	with open(file_name, "wb") as file:
		file.write(new_html)

	return
Example No. 13
def clean_html(html, strip_unsafe=False):
    """
    Clean an HTML string.
    If strip_unsafe is set, potentially malicious tags (defined in ``settings.REMOVE_WITH_CONTENT``) are also removed.

    :param html: the input HTML string that needs to be cleaned
    :type html: basestring
    :param strip_unsafe:
    :type strip_unsafe: bool
    :return: cleaned html
    :rtype: basestring
    """
    if not html:
        return ""

    doc = BeautifulSoup(html, "html.parser")

    if strip_unsafe:
        for tag in doc.find_all(True):
            if tag.name not in getattr(settings, 'ACCEPTABLE_ELEMENTS',
                                       tuple()):
                logger.warning(
                    "Found tag {} which is not in the ACCEPTABLE_ELEMENTS setting"
                    .format(tag.name))
                if tag.name in getattr(settings, 'REMOVE_WITH_CONTENT',
                                       tuple()):
                    tag.decompose()
                else:
                    tag.unwrap()

            try:
                for attr in tag.attrs.keys():
                    # strip all tags that are not in acceptable attributes
                    if attr not in getattr(settings, 'ACCEPTABLE_ATTRIBUTES',
                                           tuple()):
                        logger.warning(
                            "Removing attribute {} of tag {} as it is not listed in the "
                            "ACCEPTABLE_ATTRIBUTES settings".format(
                                attr, tag.name))
                        del tag[attr]
                        continue

                    # special cases for attributes style and href
                    if attr == 'style':
                        tag[attr] = clean_styles(tag[attr])
                    elif attr == 'href':
                        tag[attr] = clean_hrefs(tag[attr])

            except:
                pass

    return doc.encode_contents(formatter='html').decode()
Example No. 14
 def _strip_tags(self, html, invalid_tags=['em', 'a', 'span', 'strong', 'div', 'p']):
     soup = BeautifulSoup(html, "html.parser")
     for tag in soup.find_all(True):
         if tag.name in invalid_tags:
             s = ""
             for c in tag.contents:
                 if not isinstance(c, NavigableString):
                     c = self._strip_tags(unicode(c), invalid_tags)
                     s += unicode(c).strip()
                 else:
                     s += unicode(c)
             tag.replace_with(s)
     return soup.encode_contents().decode('UTF-8')
Example No. 16
    def _scrapeHomeAndGetLinks(self,home):
        soup_home=BeautifulSoup(home,'html.parser')
        soup_home.encode_contents(encoding='utf-8')

        #get the 1/3 column slices containing the major url links we want
        selectionColumns=[s for s in map(lambda t: t.encode('utf-8'),soup_home.body.find_all('div',{'class': 'one-third'}))]

        #grab the major category of link from each 1/3 column slice
        def _getHomeHyperLinks(col):
            greaterCat=BeautifulSoup(col,'html.parser').find_all('span')

            catAnchor=[]
            for cat in greaterCat:
                catAnchor.extend([anchor['href'] for anchor in cat.find_all('a')])

            return catAnchor

        list_of_urls=(_getHomeHyperLinks(col) for col in selectionColumns)

        #flatten each list of columns of url paths
        url_path=functools.reduce(lambda acc,list:acc+list,list_of_urls,[])

        return url_path
Example No. 17
def replace_cid_in_html(html, mapped_attachments):
    if html is None:
        return None

    soup = BeautifulSoup(html)

    inline_images = soup.findAll('img', {'src': lambda src: src and src.startswith('cid:')})

    for image in inline_images:
        inline_attachment = mapped_attachments.get(image.get('src')[4:])
        if inline_attachment is not None:
            image['src'] = reverse('email_attachment_proxy_view', kwargs={'pk': inline_attachment.pk})

    return soup.encode_contents()
Example No. 18
def replace_anchors_in_html(html):
    """
    Make all anchors open outside the iframe
    """
    if html is None:
        return None

    soup = BeautifulSoup(html)

    for anchor in soup.findAll('a'):
        anchor.attrs.update({
            'target': '_blank',
        })

    return soup.encode_contents()
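A hedged usage sketch for the helper above (it assumes the function is in scope; since no parser is passed to BeautifulSoup, the exact wrapper tags in the output depend on which parser is installed):

html = '<p>See <a href="https://example.com">the docs</a>.</p>'
result = replace_anchors_in_html(html)  # bytes, because encode_contents() is used
print(result.decode("utf-8"))
# every <a> now carries target="_blank"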
Example No. 19
def convert_text_to_html(input_str):
    """
    Converts a textfield (without html) to an html field
    This is useful for migrations or management commands where you need to manually convert the content of a field

    :param input_str:
    :return:
    """
    # convert newlines to line breaks
    input_str = "<p>" + input_str + "</p>"

    soup = BeautifulSoup(input_str, "html.parser")
    input_str = soup.encode_contents(encoding="utf8").decode("utf8")
    input_str = input_str.replace(u"\n", u"<br/>")

    return clean_html(input_str, strip_unsafe=True)
Example No. 20
def process(content, used=False):
    if not content:
        return

    soup = BeautifulSoup(content.encode('utf8'), from_encoding='utf8')
    for tag in soup.find_all('a'):
        href = tag.attrs.get('href', '')

        if not href.startswith('http'):
            continue

        if 'img-fotki' in href or not used:
            tag.attrs['rel'] = 'nofollow'

    content = soup.encode_contents(indent_level=2).decode('utf8')\
        .replace('<html><body>', '').replace('</body></html>', '')

    return content
Example No. 21
def edit_page(filename):
    original_page = open(filename, 'r').read()
    soup = BeautifulSoup(original_page, 'lxml')
    forms = soup.find_all('form')
    print "[*] Found forms:"
    i = 0
    for f in forms:
        print "FORM " + str(i) + " --> " + f.get('action', 'None')
        i += 1
    while True:
        try:
            i = int(raw_input('Form to log: '))
        except ValueError:
            print "Enter the form number"
            continue
        try:
            f = forms[i]
            break
        except IndexError:
            print "Invalid form number"
    print "Selected form " + str(i) + '\n'
    f['action'] = "/form"
    loggable = []
    for i in f.find_all('input'):
        if i.get('name'):
            loggable.append(i['name'])
    while True:
        print "[*] Form fields:"
        for i in range(len(loggable)):
            print str(i) + " - " + loggable[i]
        input_params = raw_input(
            'Fields to log (comma separated, e.g 1,4,5): ').split(',')
        to_log = []
        try:
            for i in input_params:
                to_log.append(loggable[int(i)])
            break
        except:
            print "Invalid format: use form field identifiers (e.g 1,4,5)"
    print 'Logging: ' + str(to_log) + '\n'
    with open('index.html', "w") as f:
        f.write(soup.encode_contents())
    return to_log
Example No. 22
    def transform_links(self, url, content):

        parse_result = urlparse.urlparse(url)
        soup = BeautifulSoup(content)

        for link in soup.find_all(True):

            href = None
            for name in ['href', 'src']:
                if name in link.attrs:
                    href = link.attrs[name]
                    break

            if href is None:
                continue

            new_href = self.get_absolute_url(parse_result.path, link.attrs[name])
            if new_href is not None:
                link.attrs[name] = new_href

        return soup.encode_contents(formatter='html')
Example No. 23
    def parse_page(self):
        page = self.read_url(self.original_url)

        soup = BeautifulSoup(page, "lxml")

        css = soup.find_all('link', {'rel': 'stylesheet'})
        js = soup.find_all('script')
        images = soup.find_all('img')
        styles = soup.find_all('style')
        inline_styles = soup.find_all(
            attrs={
                'style':
                re.compile(
                    "(?:background|background-image):(?:[ ]+|)(?:[\#\w\d]*|)(?:[ ]+|)url\((.*?)\)"
                )
            })

        formsDetection = FormsDetection(soup, self.url)
        formsDetection.replace()

        for i in images:
            if i.get('src'):
                i['src'] = self.parse_image(i['src'])
        for j in js:
            if j.get('src'):
                j['src'] = self.parse_javascript(j['src'])
                j['type'] = 'text/javascript'
        for _c in css:
            if _c.get('href'):
                _c['href'] = self.parse_css(_c['href'])
                _c['type'] = 'text/css'
        for s in styles:
            s.string = self.parse_css_text(s.string)
        for _is in inline_styles:
            _is.attrs['style'] = self.parse_css_text(_is.attrs['style'])

        return self.write_file(soup.encode_contents(), 'throwaway_dirname',
                               'html', 'w')
Example No. 24
def metadata_for_papers(paper_ids, outfile):
    fout = open(outfile, "w")

    fout.write("paper_id{0}pubmed_id{0}author_ids\n".format(DELIM))

    for i, paper_id in enumerate(paper_ids):
        # Counter
        if i % 5 == 0:
            print i

        # Execute query, convert result to Soup format
        #try:
        res = requests.get(SCOPUS_QUERY.format(paper_id))
        soup = BeautifulSoup(res.content)

        # get author list
        authors_section = soup.find(id="authorlist").encode_contents()
        author_list = re.findall(r"\?authorId=(.*?)\&", authors_section)
        authors_str = DELIM.join(author_list)

        # get pubmed ID
        try:
            # assume there's only one
            pubmed_id = re.findall(r"\"View in PubMed\">(.*?)<",
                                   soup.encode_contents())[0]
        except IndexError:
            # no pubmed ID
            print i, ":no pubmed ID"
            pubmed_id = ""

        # write to file
        fout.write("{1}{0}{2}{0}{3}\n".format(DELIM, paper_id, pubmed_id,
                                              authors_str))
        #except:
        #	# Can't find ID or something went wrong
        #	fout.write("{0}\n".format(paper_id))

    fout.close()
Example No. 25
def render_element(e_id, e_type):
	client = MongoClient('localhost', 27017)
	widget = client.asktask.q_builder_widgets.find_one({'id':e_type})
	code_js = ''
	code_block = BeautifulSoup('<div class="quest_element" id="{}"></div>'.format(e_id))
	code_block.div.append(code_block.new_tag('h3'))
	code_block.div.h3.append(code_block.new_tag('i', **{'class':'fa fa-{}'.format(widget['icon'])}))
	code_block.div.h3.append('&nbsp;'+widget['title'])
	code_block.div.append(code_block.new_tag('div'))
	#code_block.div.div.append(code_block.new_tag('p'))
	#code_block.div.div.p.append(widget['description'])
	if 'form' in widget:
		form = code_block.new_tag('form', **{'class':'widget_settings pure-form'})
		for field in widget['form']:
			if field['type'] == 'textarea':
				form.append(code_block.new_tag('textarea', **{'placeholder':field['name']}))
			elif field['type'] == 'select':
				sel_el = code_block.new_tag('select')
				for opt in field['options']:
					o = code_block.new_tag('option')
					o.append(opt)
					sel_el.append(o)
				form.append(sel_el)
			elif field['type'] == 'spinner':
				form.append(code_block.new_tag('input', **{'class':'spinner','id':'{}-{}'.format(e_id, field['id']), 'value':field['value']}))
				js_opt = []
				if 'min' in field.keys():
					js_opt.append('min: {}'.format(field['min']))
				if 'max' in field.keys():
					js_opt.append('max: {}'.format(field['max']))
				code_js += '$( "#{}-{}" ).spinner({{\n'.format(e_id, field['id'])
				code_js += ',\n'.join(js_opt)
				code_js += '});\n'
			else:
				field['id'] = '{}-{}'.format(e_id, field['id'])
				form.append(code_block.new_tag('input', **{k:v for k,v in field.items()}))
		code_block.div.div.append(form)
	return code_block.encode_contents(formatter=None), code_js
Example No. 27
def parse_cms_template(html, cms_context, parent_namespace='', public=False,
                       request=dum_request, template_context=None, using=None):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param cms_context: Dictionary that is to be used to parse the
    cms attributes in template
    :type cms_context: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param public: Renders the page for public usage
    :type public: bool
    :param request: Request object to be used for template context
    :param template_context: Template context to be used for rendering the
    base and included templates
    :type template_context: dict
    :param using: Template engine used to render the final template
    :rtype : str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)
    for tag in soup.find_all(attrs={INCLUDE_TAG: include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop(INCLUDE_TAG)
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs[NAMESPACE_TAG]
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form '
                    '{namespace}:{template path}.'
                    'if namespace is not specified then another attribute '
                    'data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for {}'.format(
                            local_namespace, NAMESPACE_TAG
                        )
                    )
                else:
                    default_template_name = include_value

        if namespace:
            namespace += NAMESPACE_DELIMITER + local_namespace
        else:
            namespace = local_namespace

        template_name = cms_context.get(namespace, default_template_name)

        if template_name.endswith('.html'):
            template_name = template_name[:-5]

        try:
            include_template = validate_and_get_template(
                name=template_name, using=using
            )
        except ValidationError:
            include_template = validate_and_get_template(
                name=default_template_name, using=using
            )

        include_html = include_template.render(template_context, request)

        tag.attrs[NAMESPACE_TAG] = local_namespace
        if not public:
            tag.attrs[INCLUDE_TAG] = template_name

        replace_tag_content(tag=tag, content=include_html)

    for tag in soup.find_all(attrs={ATTR_TAG: attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag[ATTR_TAG].split('|')

        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key

            if key in cms_context:
                tag[attr_name] = render_template_string(
                    template_string=cms_context[key],
                    context=template_context,
                    request=request, using=using
                )

    for tag in soup.find_all(attrs={CONTENT_TAG: content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag[CONTENT_TAG]
        md = False

        if key.startswith('md:'):
            key = key[3:]
            md = True

        key = _ns + NAMESPACE_DELIMITER + key if _ns else key

        if key in cms_context or REPLACE_TAG in tag.attrs:
            # REPLACE_TAG will be replaced with its content.
            # So, it doesn't make much sense to process it in the else branch
            content = cms_context.get(key, '')
        else:
            content = tag.encode_contents()
            if not any(attr in content for attr in CMS_ATTRIBUTES):
                continue

        if any(attr in content for attr in CMS_ATTRIBUTES):
            content = parse_cms_template(
                html=content, cms_context=cms_context, parent_namespace=key,
                request=request, template_context=template_context, using=using
            )

        if md:
            content = markdown(content, escape=False)

        content = render_template_string(
            template_string=content,
            context=template_context,
            request=request, using=using
        )

        if public and REPLACE_TAG in tag.attrs:
            new_tag = BeautifulSoup(content, features=HTML_PARSER)
            tag.replace_with(new_tag)
        else:
            # We don't replace the tag in auth render so as to keep it editable
            replace_tag_content(tag=tag, content=content)

    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
Example No. 28
def parse_cms_template(html, dictionary, parent_namespace='', publish=False, request=dum_request):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param dictionary: Dictionary that is to be used to parse the cms attributes in template
    :type dictionary: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param publish: This will hide sensitive info while rendering for public usage
    :type publish: bool
    :rtype : str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)

    for tag in soup.find_all(attrs={'data-cms-include': include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop('data-cms-include')
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs['data-cms-namespace']
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form {namespace}:{template path}. '
                    'if namespace is not specified then another attribute data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for data-cms-namespace'.format(local_namespace)
                    )
                else:
                    default_template_name = include_value

        namespace += NAMESPACE_DELIMITER + local_namespace if namespace else local_namespace

        template_name = dictionary.get(namespace, default_template_name)

        if template_name.endswith('.html'):
            template_name = template_name[:-5]

        try:
            include_template = validate_and_get_template(template_name)
        except ValidationError:
            include_template = validate_and_get_template(
                default_template_name[:-5] if default_template_name.endswith('.html') else default_template_name
            )

        include_html = include_template.render(request=request)

        tag.attrs['data-cms-namespace'] = local_namespace
        if not publish:
            tag.attrs['data-cms-include'] = template_name

        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        new_tag.insert(0, BeautifulSoup(include_html, features=HTML_PARSER))
        tag.replaceWith(new_tag)

    # soup does not recognize the changes made in above loop unless I do this
    # Also do not move it inside the loop. It will mess up the variable scoping
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)

    for tag in soup.find_all(attrs={'data-cms-attr': attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag['data-cms-attr'].split('|')

        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key

            if key in dictionary:
                tag[attr_name] = dictionary[key]

    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)

    for tag in soup.find_all(attrs={'data-cms-content': content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag['data-cms-content']
        md = False

        if key.startswith('md:'):
            key = key[3:]
            md = True

        key = _ns + NAMESPACE_DELIMITER + key if _ns else key

        if key in dictionary:
            content = dictionary[key]
        else:
            content = tag.encode_contents()
            if not any(_ in content for _ in CMS_ATTRIBUTES):
                continue

        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        if any(_ in content for _ in CMS_ATTRIBUTES):
            content = parse_cms_template(content, dictionary, parent_namespace=key, request=request)

        if md:
            content = markdown(content, False)

        new_tag.insert(0, BeautifulSoup(content, features=HTML_PARSER))
        tag.replace_with(new_tag)

    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
Example No. 29
 def clean_source(self, file):
     soup = BeautifulSoup(file.read())
     for tag in self.clean_tags:
         for item in soup.find_all(tag):
             item.extract()
     return soup.encode_contents()
Example No. 30
class ConfluencePageInflater(object):
    def __init__(self, page_source, page_handle, attach_handle,
                 encoding='utf-8'):
        super(ConfluencePageInflater, self).__init__()
        self.soup = BeautifulSoup(page_source, 'html5lib',
                                  from_encoding=encoding)
        self.page_handle = page_handle
        self.attach_handle = attach_handle
        self.cleaned_up = False

    def filter_image(self):
        for img in self.soup.find_all('img'):
            ac_image = self.soup.new_tag('ac:image')
            src = img.get('src')
            if src and '//' not in src:
                attach = self.attach_handle(src, img.get('title'))
                if attach:
                    ri_resource = self.soup.new_tag('ri:attachment')
                    ri_resource['ri:filename'] = attach['resource_name']
                else:
                    img.decompose()
                    continue
            else:
                ri_resource = self.soup.new_tag('ri:url')
                ri_resource['ri:value'] = src
            ac_image.append(ri_resource)
            if img.has_attr('alt'):
                ac_image['ac:alt'] = img['alt']
            img.replace_with(ac_image)

    def filter_link(self):
        for link in self.soup.find_all('a'):
            href = link.get('href')
            if href and '//' not in href:
                if '?' in href:
                    href = href[:href.index('?')]
                ac_link = self.soup.new_tag('ac:link')
                if '#' in href:
                    ac_link['ac:anchor'] = href[href.index('#') + 1:]
                    href = href[:href.index('#')]
                if href.endswith('.html'):
                    page = self.page_handle(href)
                    if page:
                        ri_resource = self.soup.new_tag('ri:page')
                        ri_resource['ri:content-title'] = page['title']
                    else:
                        link.decompose()
                        continue
                else:
                    attach = self.attach_handle(href, link.get('title'))
                    if attach:
                        ri_resource = self.soup.new_tag('ri:attachment')
                        ri_resource['ri:filename'] = attach['resource_name']
                    else:
                        link.decompose()
                        continue
                ac_link.append(ri_resource)
                children = link.find_all()
                if children:
                    body = self.soup.new_tag('ac:link-body')
                    for child in children:
                        body.append(child)
                elif link.text:
                    body = self.soup.new_tag('ac:plain-text-link-body')
                    body.append(self.soup.new_string(link.text, CData))
                else:
                    link.decompose()
                    continue
                if link.has_attr('title'):
                    ac_link['ac:title'] = link['title']
                ac_link.append(body)
                link.replaceWith(ac_link)

    @property
    def title(self):
        title = self.soup.find('title')
        return title and title.encode_contents().strip() or ''

    def filter_dl(self):
        for dl in self.soup.find_all('dl'):
            ul = self.soup.new_tag('ul')
            dts = dl.find_all('dt')
            dds = dl.find_all('dd')
            for dt, dd in zip(dts, dds):
                li = self.soup.new_tag('li')
                dt.name = 'p'
                li.append(dt)
                dd.name = 'p'
                li.append(dd)
                ul.append(li)
            dl.replace_with(ul)

    @property
    def is_home_page(self):
        meta = self.soup.find('meta', attrs={'name': 'homepage'})
        return meta is not None and meta.get('value') == 'true'

    def filter_code(self):
        for pre in self.soup.find_all('pre'):
            code_block = self.soup.new_tag('ac:structured-macro')
            code_block['ac:name'] = 'code'

            if pre.has_attr('data-lang'):
                lang_param = self.soup.new_tag('ac:parameter')
                lang_param['ac:name'] = 'language'
                lang_param.append(pre['data-lang'])
                code_block.append(lang_param)

            plain_text = self.soup.new_tag('ac:plain-text-body')
            plain_text.append(self.soup.new_string(pre.get_text(), CData))
            code_block.append(plain_text)
            pre.replace_with(code_block)

    @property
    def cleaned_src(self):
        if not self.cleaned_up:
            self.cleaned_up = True
            self.filter_image()
            self.filter_link()
            self.filter_dl()
            self.filter_code()
        body = self.soup.find('body')
        return (body and body.encode_contents(formatter='html') or
                self.soup.encode_contents(formatter='html'))
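A hedged usage sketch for the inflater above. It assumes the class is importable and that html5lib (the parser it requests) is installed; the two stub handlers are hypothetical stand-ins for real page and attachment lookups:

page = (
    "<html><head><title>Install guide</title></head>"
    "<body><p>See <a href='setup.html'>setup</a> and <img src='diagram.png'/></p></body></html>"
)

inflater = ConfluencePageInflater(
    page,
    page_handle=lambda href: {'title': 'Setup'},              # map .html links to pages
    attach_handle=lambda src, title: {'resource_name': src},  # map resources to attachments
)

print(inflater.title)        # b'Install guide' -- encode_contents() returns bytes
print(inflater.cleaned_src)  # <body> serialized with <ac:image>/<ac:link> storage-format macros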
Example No. 31
def parse_cms_template(html, cms_context, parent_namespace='', public=False,
                       request=dum_request, template_context=None):
    """
    Refer to tests for cms syntax

    :param html: Html to be parsed using cms syntax
    :type html: str
    :param cms_context: Dictionary that is to be used to parse the
    cms attributes in template
    :type cms_context: dict
    :param parent_namespace: Namespace of the html content to be parsed (if any)
    :type parent_namespace: str
    :param public: Renders the page for public usage
    :type public: bool
    :param request: Request object to be used for template context
    :param template_context: Template context to be used for rendering the
    base and included templates
    :type template_context: dict
    :rtype : str
    """
    soup = BeautifulSoup(html, features=HTML_PARSER)

    for tag in soup.find_all(attrs={INCLUDE_TAG: include_html_re}):
        namespace = get_namespace(tag, parent_namespace=parent_namespace)
        include_value = tag.attrs.pop(INCLUDE_TAG)
        if ':' in include_value:
            local_namespace, default_template_name = include_value.split(':', 1)
        else:
            try:
                local_namespace = tag.attrs[NAMESPACE_TAG]
            except KeyError:
                raise TemplateSyntaxError(
                    'value of data-cms-include should be of the form '
                    '{namespace}:{template path}.'
                    'if namespace is not specified then another attribute '
                    'data-cms-namespace should be defined'
                )
            else:
                if not namespace_re.match(local_namespace):
                    raise TemplateSyntaxError(
                        '"{}" is not a valid value for {}'.format(
                            local_namespace, NAMESPACE_TAG
                        )
                    )
                else:
                    default_template_name = include_value

        if namespace:
            namespace += NAMESPACE_DELIMITER + local_namespace
        else:
            namespace = local_namespace

        template_name = cms_context.get(namespace, default_template_name)

        if template_name.endswith('.html'):
            template_name = template_name[:-5]

        try:
            include_template = validate_and_get_template(template_name)
        except ValidationError:
            include_template = validate_and_get_template(default_template_name)

        include_html = include_template.render(template_context, request)

        tag.attrs[NAMESPACE_TAG] = local_namespace
        if not public:
            tag.attrs[INCLUDE_TAG] = template_name

        new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
        new_tag.insert(0, BeautifulSoup(include_html, features=HTML_PARSER))
        tag.replaceWith(new_tag)

    # soup does not recognize the changes made in above loop unless I do this
    # Also do not move it inside the loop. It will mess up the variable scoping
    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)

    for tag in soup.find_all(attrs={ATTR_TAG: attr_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        attrs = tag[ATTR_TAG].split('|')

        for attr in attrs:
            attr_name, key = attr.split(':', 1)
            key = _ns + NAMESPACE_DELIMITER + key if _ns else key

            if key in cms_context:
                tag[attr_name] = cms_context[key]

    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)

    for tag in soup.find_all(attrs={CONTENT_TAG: content_re}):
        _ns = get_namespace(tag, parent_namespace=parent_namespace)
        key = tag[CONTENT_TAG]
        md = False

        if key.startswith('md:'):
            key = key[3:]
            md = True

        key = _ns + NAMESPACE_DELIMITER + key if _ns else key

        if key in cms_context or REPLACE_TAG in tag.attrs:
            # REPLACE_TAG will be replaced with its content.
            # So, it doesn't make much sense to process it in the else branch
            content = cms_context.get(key, '')
        else:
            content = tag.encode_contents()
            if not any(_ in content for _ in CMS_ATTRIBUTES):
                continue

        if any(_ in content for _ in CMS_ATTRIBUTES):
            content = parse_cms_template(
                html=content, cms_context=cms_context, parent_namespace=key,
                request=request, template_context=template_context
            )

        if md:
            content = markdown(content, False)

        if public and REPLACE_TAG in tag.attrs:
            new_tag = BeautifulSoup(content, features=HTML_PARSER)
        else:
            # We don't replace the tag in auth render so as to keep it editable
            new_tag = Tag(soup, name=tag.name, attrs=tag.attrs)
            new_tag.insert(0, BeautifulSoup(content, features=HTML_PARSER))

        tag.replace_with(new_tag)

    soup = BeautifulSoup(soup.encode_contents(), features=HTML_PARSER)
    # don't use soup.prettify as it will insert empty spaces inside textarea
    return soup.encode_contents()
Example No. 32
    def generate_translation_tuples(self, soup):
        """
        A generator of translation tuples
        :param soup: BeautifulSoup object
        :return: tuple of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code, part_of_speech)
        """

        # START non-edition-specific
        # this is the table of content which is present in each edition
        toc = soup.find('div', id='mw-content-text')

        page_state = {
            'headword': None,
            'headword_lang': None,
            'part_of_speech': ''
        }

        pronounce = ''

        page_state['headword'] = soup.find('h1',
                                           id='firstHeading',
                                           class_='firstHeading').text

        for element in toc.children:
            if isinstance(element,
                          Tag):  # it could be a Tag or a NavigableString
                level = self.get_heading_level(element.name)
                # END non-edition-specific
                # Find the headword language

                if 'style' in element.attrs and element[
                        'style'] == 'background:#EEEEFF':

                    if element.a is not None:

                        page_state['headword_lang'] = element.a.text.replace(
                            'dili', '').strip()
                        pronounce = ''

                elif element.a is not None and \
                    'title' in element.a.attrs and 'Kateqoriya:Nitq hissələri' in element.a['title']:

                    page_state['part_of_speech'] = element.a.text

                elif element.name == 'ul':

                    for li in element.find_all('li'):
                        if not isinstance(li, Tag):
                            continue
                        if li.get_text().split(':')[0] == 'Tələffüz':
                            pronounce = li.get_text().split(':')[1].strip()

                elif element.span is not None:

                    formatted = BeautifulSoup(element.span.text, 'html.parser')

                    formatted = formatted.encode_contents(formatter='html')

                    if b'T\xc9\x99rc&uuml;m\xc9\x99l\xc9\x99r&nbsp;:' in formatted:

                        for translation, lang, lang_code in self.parse_translation_table(\
                            element.find_next_sibling('div', class_='NavFrame')):

                            if translation == '':
                                continue
                            lang = lang.strip()
                            yield (self.edition, page_state['headword'],
                                   page_state['headword_lang'], translation,
                                   lang, lang_code,
                                   page_state['part_of_speech'], pronounce)
Example No. 33
    def _replace_hrefs(self, in_body):
        """
        The function which replaces hrefs in each body segment.

        :param in_body: The human-readable form of the message segment.
        :return: The message segment with all links replaced.
        """

        found_valid_links = False
        original_a_tags = list()  # Stores discovered anchor tags
        original_hrefs = list()  # Stores discovered links

        message_html_soup = BeautifulSoup(in_body)

        # Find all 'a' and 'area' tags containing the 'href' property
        for discovered_href in message_html_soup.findAll(['a', 'area'], href=True):
            url = discovered_href['href'].strip()
            if (len(url) == 0 or
                url.startswith("mailto:") or
                url.startswith("tel:") or
                url.startswith("#") or
                # Allow Google calendar response links ('Yes', 'No', 'Maybe') to not be replaced.
                # On Android, this will cause a better experience because the Calendar handler
                # ties directly into Gmail app.
                # TODO Make a regex to work with international Google TLDs
                url.startswith("https://www.google.com/calendar/event?action=RESPOND")
            ):
                continue

            # Check this link to see if it's a MailBeaker link. If it is,
            # we unwrap it.
            discovered_href['href'] = self._unwrap_mailbeaker_link(url)

            found_valid_links = True
            original_a_tags.append(discovered_href)
            original_hrefs.append(discovered_href['href'])

        try:
            replacement_link_ids, replacement_links = self.beaker_client.generate_replacement_links(original_hrefs,
                                                                                               self.message_id,
                                                                                               self.domain_id,
                                                                                               # The following is used to not save a beta user's links, by request.
                                                                                               email=self.rcpt_to_address)
        except Exception as e:
            # Something failed while attempting to retrieve a new link URL, just go on.
            logging.exception("Link generation failed.",
                              extra={"email": self.rcpt_to_address,
                                     "service_message_id": self.message_id,
                                     "links": original_hrefs})  # TODO temp, remove from logging in the near future
            return in_body, found_valid_links

        for i, anchor_tag in enumerate(original_a_tags):
            try:
                a_title = "Protected by MailBeaker.  Original link destination: " + \
                          self._html_encode(anchor_tag['href'])

                anchor_tag['title'] = a_title
                anchor_tag['href'] = replacement_links[i].replace("\"", "")
            except Exception as e:
                # Something failed while attempting to replace the link. Warn and proceed.
                logging.exception("Link replacement failed.",
                                  extra={"email": self.rcpt_to_address,
                                         "service_message_id": self.message_id})

        # Dump the Soup back out to a string in the proper encoding
        in_body = message_html_soup.encode_contents(encoding='utf-8')

        # Update the message object's lists for links and link IDs.
        self.link_urls.extend(original_hrefs)
        self.link_ids.extend(replacement_link_ids)

        return in_body, found_valid_links
Example No. 34
class WebResource(object):
	def _is_absolute(self, url):
		if not url:
			return False
		return bool(urlparse(url).scheme)

	@valid_mime
	def _is_stylesheet(self):
		if self._mime_minor() == 'css':
			return True
		return False

	@valid_mime
	def _is_image(self):
		if self._mime_major() == 'image':
			return True
		return False

	@valid_mime
	def _is_generic_mime(self):
		if self.mime == 'text/plain':
			return True
		return False

	@valid_mime
	def _mime_major(self):
		try:
			return self.mime.split('/')[0]
		except:
			return None

	@valid_mime
	def _mime_minor(self):
		try:
			return self.mime.split('/')[1]
		except:
			return None

	def _recursive_cache_resource(self, url):
		"""Returns:
		filename => the filename of the cached resource
		"""
		if url is None:
			return None
		r = WebResource(url, self.base_storage, self.readable, self.user_agent, self.log)
		r.serialize()
		return r.filename

	def getMime(self):
		resource_mimetype = self.response.info()['Content-Type']
		# Taking care of content type with encoding,
		# ex. 'text/html; charset=UTF-8'
		resource_mimetype = resource_mimetype.split(';')[0]
		return resource_mimetype

	def getFilenameAndExtension(self):
		resource_extension = None
		if self.mime:
			resource_extension = mimetypes.guess_extension(self.mime)
		if not resource_extension:
			guessed_mime = mimetypes.guess_type(self.url)[0]
			if guessed_mime:
				resource_extension = mimetypes.guess_extension(guessed_mime)
		# Parsing url and extracting filename and file_ext.
		# TODOs:
		# Look if the following os module functions work on windows also,
		# as windows path separator is '\'.
		url_parsed = urlparse(self.url)
		filename, file_ext = splitext(basename(url_parsed.path))
		if not resource_extension:
			resource_extension = file_ext
		if not resource_extension:
			self.log.info('Extension could not be guessed for url: %s' % self.url)
			resource_extension = '.none'
		return filename, resource_extension

	"""
	Parses the html structure using beautiful soup.
	"""
	def parseHtml(self):
		try:
			self.soup = BeautifulSoup(self.content, "html5lib")
		except:
			self.log.exception('Failed to parse: %s' % self.url)
			self.soup = None

	def render_updated_html(self):
		return _to_unicode(self.soup.encode_contents())

	def contents_as_unicode(self):
		return _to_unicode(self.content)

	def cache_style_content(self, content, inline=False):
		"""
		Caches all required URI's and Imports.
		Returns,
		- updated css content
		"""
		if inline:
			sheet = cssutils.parseStyle(content)
		else:
			sheet = cssutils.parseString(content, href=self.url)
		if not inline:
			for rule in sheet.cssRules:
				if rule.type == rule.IMPORT_RULE:
					f = self._recursive_cache_resource(rule.styleSheet.href)
					rule.href = f
		def replacer(url):
			if url.startswith('data'):
				return url
			# TODOs:
			# Check for absolute url before joining
			return self._recursive_cache_resource(urljoin(self.url, url))
		cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
		return sheet.cssText

	def serialize(self):
		if self._is_stylesheet():
			self.content = self.cache_style_content(self.content)
		if self._is_image() or self._is_generic_mime():
			f = open(self.base_storage + self.filename, "wb")
			f.write(self.content)
			f.close()
		else:
			f = codecs.open(self.base_storage + self.filename, "w", "utf-8-sig")
			f.write(_to_unicode(self.content))
			f.close()

	@parsed
	def serializeUpdated(self):
		f = codecs.open(self.base_storage + self.filename, "w", "utf-8-sig")
		f.write(_to_unicode(self.soup.encode_contents()))
		f.close()

	@parsed
	def update_node_references(self):
		# Getting the source (src) attribute corrected
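		# e.g. urljoin('http://example.com/a/page.html', 'img/x.png') -> 'http://example.com/a/img/x.png'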
		node_list = self.soup.find_all(src=re.compile(''))
		for node in node_list:
			link_attr = node.get('src')
			if not self._is_absolute(link_attr):
				node.attrs['src'] = urljoin(self.url, link_attr)

		# Getting the hyper-reference (href) attribute corrected
		node_list = self.soup.find_all(href=re.compile(''))
		for node in node_list:
			link_attr = node.get('href')
			if not self._is_absolute(link_attr):
				node.attrs['href'] = urljoin(self.url, link_attr)

		self.updated_references = True

	@parsed
	@updated_references
	def cache_resources(self):
		# Updating the <link> tags
		for link in self.soup.find_all('link', rel=re.compile('stylesheet|icon')):
			f = self._recursive_cache_resource(link.get('href'))
			if f is not None:
				link.attrs['href'] = f
		# Updating elements with a src attribute
		for tag in self.soup.find_all(src=re.compile('')):
			f = self._recursive_cache_resource(tag.get('src'))
			if f is not None:
				tag.attrs['src'] = f
		# Looking over the style attribute
		for tag in self.soup.find_all(style=re.compile('')):
			css = self.cache_style_content(tag.get('style'), inline=True)
			if css is not None:
				tag.attrs['style'] = css
		# The <style> tag
		for link in self.soup.find_all('style'):
			css = self.cache_style_content(link.text)
			if css is not None:
				link.string = css

	def cache(self):
		self.cache_resources()
		self.filename = 'index.html'
		self.serializeUpdated()

	def __init__(self, url, base_storage='cache/', readable=False, user_agent='Mozilla/5.0', log='websnip.log'):
		super(WebResource, self).__init__()
		self.url = url
		self.readable = readable
		self.base_storage = base_storage
		self.user_agent = user_agent
		self.log = Log(log)

		# TODO:
		# Handle the different kinds of URL-opening errors, e.g. retry on timeout.
		try:
			self.url_opener = urllib2.build_opener()
			self.url_opener.addheaders = [('User-agent', self.user_agent)]
			self.response = self.url_opener.open(self.url)
			self.content = self.response.read()
			self.mime = self.getMime()
			h = hashlib.md5()
			h.update(self.content)
			self.hash = h.hexdigest()
		except Exception:
			self.response = None
			self.content = None
			self.mime = None
			self.hash = None

		self.filebase, self.extension = self.getFilenameAndExtension()
		if self.hash:
			if self.readable:
				self.filebase = self.filebase + '-' + self.hash[:8]  # First 8 characters of the md5 hash
			else:
				self.filebase = self.hash
		self.filename = self.filebase + self.extension

		self.soup = None
		self.updated_references = False

	@deprecated
	@parsed
	def updateNodeReferences(self, node, ref):
		for link in self.soup.find_all(node):
			link_attr = link.get(ref)
			if not self._is_absolute(link_attr):
				link.attrs[ref] = urljoin(self.url, link_attr)

	@deprecated
	@parsed
	def updateReferences(self):
		self.updateNodeReferences('a', 'href')
		self.updateNodeReferences('a', 'src')
		self.updateNodeReferences('link', 'href')
		self.updateNodeReferences('img', 'src')
		self.updateNodeReferences('script', 'src')
		self.updated_references = True

	@deprecated
	@parsed
	@updated_references
	def cacheNodeReferences(self, node, ref):
		for link in self.soup.find_all(node):
			link_attr = link.get(ref)
			if link_attr:
				r = WebResource(link_attr, self.base_storage, self.readable, self.user_agent, self.log)
				r.serialize()
				link.attrs[ref] = r.filename

	@deprecated
	@parsed
	@updated_references
	def cacheReferencedResources(self):
		self.cacheNodeReferences('link', 'href')
		self.cacheNodeReferences('img', 'src')
		self.cacheNodeReferences('script', 'src')
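
A rough usage sketch for the class above. It assumes the surrounding module also defines the Log helper, the _to_unicode function, and the @parsed, @valid_mime, @updated_references, and @deprecated decorators, which are referenced but not shown here:

# Hypothetical driver code; the call order mirrors what cache() and the decorators imply.
resource = WebResource('http://example.com/', base_storage='cache/', readable=True)
resource.parseHtml()               # build resource.soup with html5lib
resource.update_node_references()  # rewrite relative src/href attributes to absolute URLs
resource.cache()                   # cache linked resources, then write cache/index.html
print resource.render_updated_html()[:200]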
Ejemplo n.º 35
def changetext(num):
    for i in range(0, num):

        try:
            driver.get(tkPage)
            if i >= 1:
                driver.find_element_by_tag_name('body').click()
                driver.find_element_by_tag_name('body').send_keys(Keys.SPACE)
                # u'\ue00d'
                time.sleep(5)
            step2 = wait.until(
                EC.element_to_be_clickable(
                    (By.ID, 'w0-data-table-grid-row[' + str(i) + ']-w0')))
            step2.click()

            url = driver.current_url
            # print url
            print 'ID OF PRODUCT:', re.findall(r'\d{12}$', url)

            step3 = wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, 'HTML')))
            step3.click()

            # Locate the HTML editor iframe: try the known frame ids in order
            # until one is present on the page, then switch into it.
            editor_frame_ids = [
                'v4-22txtEdit_ht', 'v4-20txtEdit_ht', 'v4-46txtEdit_ht',
                'v4-43txtEdit_ht', 'v4-26txtEdit_ht', 'v4-32txtEdit_ht',
                'v4-29txtEdit_ht', 'v4-23txtEdit_ht', 'v4-28txtEdit_ht',
                'v4-47txtEdit_ht', 'v4-16txtEdit_ht', 'v4-25txtEdit_ht',
                'v4-5txtEdit_ht', 'v4-35txtEdit_ht', 'v4-19txtEdit_ht',
                'v4-38txtEdit_ht', 'v4-13txtEdit_ht', 'v4-17txtEdit_ht',
                'v4-34txtEdit_ht',
            ]
            for frame_id in editor_frame_ids:
                try:
                    driver.find_element_by_id(frame_id)
                    driver.switch_to.frame(frame_id)
                    print 'frame:' + frame_id
                    break
                except NoSuchElementException:
                    continue
            else:
                # No known editor frame was found; let the outer handler deal with it.
                raise NoSuchElementException('no known editor frame found')

            content1 = driver.find_element_by_tag_name('body').text

            # Added for the store listings: they used <a> as an anchor, which is not supported in HTML5 (2017-09-27)
            content = re.sub(r'<(a|\/a).*?>', "", content1)

            soup = BeautifulSoup(content, "lxml")

            links = soup.find_all('a')

            for link in links:
                # print type(link)
                link['target'] = '_blank'

            print 'link target is all _blank'

            try:

                # [link.extract() for link in soup.find_all('a', href=re.compile('xxxxx'))]
                [
                    x.parent.extract()
                    for x in soup.findAll('img', {
                        'src':
                        'http://www.xxxxxs.com/fpdb/images/Logo_Ny62.jpg'
                    })
                ]
                [
                    x.parent.extract()
                    for x in soup.findAll('img', {
                        'src':
                        'http://www.xxxxxs.com/images/Risk_Free_logo.gif'
                    })
                ]
                [
                    x.parent.extract()
                    for x in soup.findAll('img', {
                        'src':
                        'http://www.xxxxxs.com/images/contact_email.jpg'
                    })
                ]

                # [x.parent.extract() for x in soup.findAll('img', {'src': 'http://www.xxxxxs.com/images/contact_email.jpg'})]
            except Exception:
                print 'Category is wrong'
            # modifiedtxt1 = str(soup)
            modifiedtxt = soup.encode_contents(formatter='html')
            #############################################################

            copy(modifiedtxt)
            try:
                driver.find_element_by_tag_name('body').clear()
                driver.find_element_by_tag_name('body').send_keys(
                    Keys.CONTROL, 'v')
            except Exception as e1:
                print 'bad:', e1


            #############################################################
            # update and save
            driver.switch_to.default_content()
            step4 = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'div#actionbar > input')))
            step4.click()
            # print 'step4 ok'
            time.sleep(10)

            step5 = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'div#confirm_button_wrap > input')))
            step5.click()
            print i, ' is Ok'
            with open('picturefolder.txt', 'a') as f:
                f.write('Ok')

        ############################################################

        except Exception as e3:

            print 'error in first, i is', i
            print e3
            time.sleep(3)
            continue
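
For reference on encode_contents(formatter='html') as used in the example above: the 'html' formatter converts characters to HTML named entities where possible, while the default 'minimal' formatter only escapes what HTML strictly requires. A standalone sketch (not part of the example):

from bs4 import BeautifulSoup

snippet = BeautifulSoup(u"<p>caf\u00e9 & cream</p>", "lxml")
print(snippet.encode_contents(formatter="html"))
# roughly: b'<html><body><p>caf&eacute; &amp; cream</p></body></html>'
print(snippet.encode_contents(formatter="minimal"))
# roughly: b'<html><body><p>caf\xc3\xa9 &amp; cream</p></body></html>'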