def to_html(self):
    html = []
    html_examples = self.get_examples_html()
    html_options = self.get_options_html()

    message = "<div>Your friend <b>likes</b> these 3 restaurants:</div>"
    option_message = "<div>Among the following 3 restaurants, which would you recommend most:</div>"
    html.append(message)
    html.extend(html_examples)
    html.append(option_message)
    html.extend(html_options)

    # Return the assembled fragments, not the raw inputs.
    return html
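A minimal usage sketch: the host class is not shown on this page, so `RestaurantPrompt` below is a hypothetical stand-in with stubbed getters.

class RestaurantPrompt:
    # Hypothetical stubs; the real getters are defined elsewhere.
    def get_examples_html(self):
        return ['<div>Sushi Place</div>', '<div>Taco Stand</div>', '<div>Bistro 9</div>']

    def get_options_html(self):
        return ['<div>Option A</div>', '<div>Option B</div>', '<div>Option C</div>']

RestaurantPrompt.to_html = to_html  # attach the function above as a method

print('\n'.join(RestaurantPrompt().to_html()))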
from time import sleep
from progressbar import ProgressBar, Percentage, Bar, ETA

def store_htmls_in_list():
	urls = get_urls_from_text()
	widgets = ['Getting link html: ', Percentage(), Bar(), ETA(), ' ']
	pbar = ProgressBar(widgets=widgets, maxval=len(urls)).start()
	html = []
	for i, l_url in enumerate(urls):
		url = l_url.encode("ascii")
		html.append(get_html_from_website(url))
		sleep(0.001)
		pbar.update(i)
	pbar.finish()
	return html
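The two helpers this function calls are not shown on this page; a hedged sketch of what they might look like, assuming the URLs live in a local text file and plain urllib fetching suffices:

import re
import urllib.request

def get_urls_from_text(path='links.txt'):
    # Hypothetical: pull http(s) URLs out of a text file.
    with open(path) as f:
        return re.findall(r'https?://\S+', f.read())

def get_html_from_website(url):
    # Hypothetical: fetch a page and return its raw HTML.
    if isinstance(url, bytes):  # the caller ascii-encodes its URLs
        url = url.decode('ascii')
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('utf-8', errors='replace')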
Example #3
    def render_many(self, documents, **kwargs):
        html = []

        with tempfile.TemporaryDirectory() as tmpdir:
            # render individual documents
            for i, doc in enumerate(documents):
                self.media_url = f'doc-{i}/'
                doc_html = super(PDFRenderer, self).render(doc, **kwargs)
                self.save_attachments(doc_html, doc, f'doc-{i}/media/', tmpdir)

                html.append(doc_html)

            # combine and embed the HTML into the PDF container
            html = render_to_string('indigo_api/akn/export/pdf.html', {
                'documents': list(zip(documents, html)),
            })

            return self.to_pdf(html, tmpdir, documents=documents)
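Note the deliberate name reuse here: `html` starts as a per-document accumulator, then is rebound to the single combined string once the template wraps all the fragments; only that final string reaches `to_pdf`.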
Example #4
  def adddivstodoc(self, thtml):
    """
    put divs around headers
    """
    import copy
    from lxml import etree, html

    oldbody = html.fromstring('<body>\n' + thtml + '\n</body>')
    newbody = html.fromstring('<html>\n</html>')
    activediv = None
    for child in oldbody.iter():
      if child.getparent() == oldbody:
        if child.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
          # Flush the previous header's div before starting a new one.
          if activediv is not None:
            newbody.append(activediv)

          activediv = etree.fromstring('<div class="indent%s"></div>'
                                       % child.tag)
          activediv.append(copy.deepcopy(child))

        elif activediv is not None:
          activediv.append(copy.deepcopy(child))
        else:
          newbody.append(copy.deepcopy(child))

    if activediv is not None:
      newbody.append(activediv)

    # Decode so the string operations below also work on Python 3.
    htmlout = etree.tostring(newbody, pretty_print=True).decode('utf-8')

    html = htmlout.split('\n')

    # Strip the <html> wrapper that fromstring() introduced.
    if html[0] == '<html>':
      html.pop(0)
    while html[-1] == '':
      html.pop()
    lastelem = html.pop()
    lastelem = lastelem.replace('</html>', '')
    if lastelem:
      html.append(lastelem)

    return '\n'.join(html)
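A hedged usage sketch; the class hosting this method is not shown on the page, so assume `conv` is an instance of it:

out = conv.adddivstodoc('<h1>Intro</h1>\n<p>First paragraph.</p>\n<h2>Next</h2>')
# Each header and the elements following it come back wrapped in
# <div class="indenth1">, <div class="indenth2">, and so on.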
Example #6
    def process_response(self, request, response):
        if (request.META.get('HTTP_X_PJAX') and
            response.status_code == 200 and
            'html' in response.get('content-type', '').lower()):
            # TODO(Kumar) cache this.
            with statsd.timer('pjax.parse'):
                tree = lxml.html.document_fromstring(response.content)
                # HTML is encoded as ascii with entity refs for non-ascii.
                html = []
                found_pjax = False
                for elem in tree.cssselect('title,%s'
                                           % settings.PJAX_SELECTOR):
                    if elem.tag == 'title':
                        # Inject a <title> for jquery-pjax
                        html.append(lxml.html.tostring(elem, encoding=None))
                    else:
                        found_pjax = True
                        if elem.text:
                            html.append(elem.text.encode('ascii',
                                                         'xmlcharrefreplace'))
                        for ch in elem.iterchildren():
                            html.append(lxml.html.tostring(ch, encoding=None))
                if not found_pjax:
                    msg = ('pjax response for %s does not contain selector %r'
                           % (request.path, settings.PJAX_SELECTOR))
                    if settings.DEBUG:
                        # Tell the developer the template is bad.
                        raise ValueError(msg)
                    else:
                        pjax_log.error(msg)
                        return response

                response.content = ''.join(html)

        return response
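The middleware reads two Django settings; the names come from the code above, but the values below are hypothetical examples:

# settings.py (hypothetical values)
PJAX_SELECTOR = '#page'   # CSS selector for the container jquery-pjax swaps in
DEBUG = False             # in DEBUG, a missing selector raises instead of logging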
Example #8
def diff_prettyHtml(self, diffs):
    """Convert a diff array into a pretty HTML report.

    Args:
      diffs: Array of diff tuples.

    Returns:
      HTML representation.
    """
    html = []
    for (op, data) in diffs:
        text = (data.replace("&", "&amp;").replace("<", "&lt;").replace(
            ">", "&gt;").replace("\n", "<br>"))
        if op == self.DIFF_INSERT:
            html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text)
        elif op == self.DIFF_DELETE:
            html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text)
        elif op == self.DIFF_EQUAL:
            html.append("<span>%s</span>" % text)
    return "".join(html)
Example #9
import lxml.etree
import lxml.html
from lxml.cssselect import CSSSelector

def hierarchy_level(node):
    # Depth = number of ancestors (node included) carrying the 'topicref' class.
    current = node
    count = 0
    while current is not None:
        css_class = current.attrib.get('class', '')
        if 'topicref' in css_class:
            count += 1
        current = current.getparent()
    return count

with open(full_fn, 'rb') as fp:
    root = lxml.html.fromstring(fp.read())

selector = CSSSelector('li.topicref')

html = lxml.etree.Element('html')
html.append(lxml.etree.Element('head'))
body = lxml.etree.Element('body')
html.append(body)

for topicref in selector(root):
    first_link = topicref.find('a')
    topic_href = first_link.attrib['href']
    topic_title = first_link.text
    topic_level = hierarchy_level(topicref)

    article_node = article_from_href(topic_href)
    article_node.attrib['level'] = str(topic_level)
    body.append(article_node)

with open(target_fn, 'wb') as fp:
    fp.write(lxml.html.tostring(html))
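`full_fn`, `target_fn`, and `article_from_href` are defined elsewhere in the original script. For completeness, a hypothetical sketch of the helper, assuming each href names a local HTML file whose body content becomes an `<article>`:

def article_from_href(href):
    # Hypothetical: parse the referenced file and repack its body as <article>.
    with open(href, 'rb') as f:
        doc = lxml.html.fromstring(f.read())
    article = lxml.etree.Element('article')
    for child in doc.body:
        article.append(child)
    return article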
threads = tree.xpath("//div[contains(@class, 'thread')]")

# Make a list of the user names available to select from,
# and keep a handle on the matching thread.
names = []
for thread in threads:
	names.append(thread.text)
	if thread.text == "REPLACEDUSERNAMES":
		courtenaythread = thread

html = []
messages = courtenaythread.xpath("//div[text()[contains(., 'REPLACEDUSERNAMES')]]/*")

for message in messages:
	if message.keys() == ['class']:
		html.append("<hr>")
		html.append("Sent by")
		html.append(message[0][0].text)
		html.append("<br>")
		html.append("Date and Time")
		html.append(message[0][1].text)
	else:
		html.append("<br>")
		html.append("Message:")
		html.append(message.text)
import scraperwiki
import lxml.html

companies = [
    'AgrIcola+Daniella', 'Agricola+El+Consuelo', 'Agricola+El+Rosal',
    # ... (the full company list is truncated on this page) ...
    'Custom+Pak']

# WARNING LETTERS: http://google2.fda.gov/search?client=FDAgov&site=FDAgov-WarningLetters-ICECI&output=xml_no_dtd&proxystylesheet=FDAgov&ie=UTF-8&oe=UTF-8&as_q=&num=100&btnG=Search&as_epq=COMPANY+NAME&as_oq=&as_eq=&restrictBox=FDAgov-WarningLetters-ICECI&lr=&as_ft=i&as_filetype=&as_occt=any&as_dt=i&as_sitesearch=&sort=
# ENFORCEMENT REPORTS: http://google2.fda.gov/search?client=FDAgov&site=FDAgov-EnforcementReports-Safety&output=xml_no_dtd&proxystylesheet=FDAgov&ie=UTF-8&oe=UTF-8&as_q=++&num=100&btnG=Search&as_epq=COMPANY+NAME&as_oq=&as_eq=&restrictBox=FDAgov-EnforcementReports-Safety&lr=&as_ft=i&as_filetype=&as_occt=any&as_dt=i&as_sitesearch=&sort=

SearchURL1First = "http://google2.fda.gov/search?client=FDAgov&site=FDAgov-WarningLetters-ICECI&output=xml_no_dtd&proxystylesheet=FDAgov&ie=UTF-8&oe=UTF-8&as_q=&num=100&btnG=Search&as_epq="
SearchURL1Second = "&as_oq=&as_eq=&restrictBox=FDAgov-WarningLetters-ICECI&lr=&as_ft=i&as_filetype=&as_occt=any&as_dt=i&as_sitesearch=&sort="

SearchURL2First = "http://google2.fda.gov/search?client=FDAgov&site=FDAgov-EnforcementReports-Safety&output=xml_no_dtd&proxystylesheet=FDAgov&ie=UTF-8&oe=UTF-8&as_q=++&num=100&btnG=Search&as_epq="
SearchURL2Second = "&as_oq=&as_eq=&restrictBox=FDAgov-EnforcementReports-Safety&lr=&as_ft=i&as_filetype=&as_occt=any&as_dt=i&as_sitesearch=&sort="

html = []
root = []

# Fetch both FDA search-result pages for each company name.
for z in companies:
    html.append(scraperwiki.scrape(SearchURL1First + z + SearchURL1Second))
    html.append(scraperwiki.scrape(SearchURL2First + z + SearchURL2Second))

# Parse each page, then save every result link by its URL.
for x in html:
    root.append(lxml.html.fromstring(x))

for y in root:
    for el in y.cssselect("p.g a"):
        data = {'URL': el.attrib['href']}
        scraperwiki.sqlite.save(unique_keys=['URL'], data=data)
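The two-pass structure (all scrapes accumulated in `html`, all parses in `root`, then one save loop) keeps fetching separate from parsing, and `unique_keys=['URL']` makes repeated runs update existing rows rather than duplicate them.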
Example #12
import lxml.etree

def parse_html(content):
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree

tree = parse_html(file)
names = []
threads = tree.xpath("//div[contains(@class, 'thread')]")
conversations = {}
# Build a list of HTML fragments for each message thread.
for thread in threads:
	html = []
	for message in thread:
		if message.keys() == ['class']:
			html.append("<br> Sent by: " + message[0][0].text)
			html.append("Date and Time: " + message[0][1].text)
		else:
			html.append("Message:")
			html.append(message.text)
	conversations[thread.text] = html

courtneyconvos = {}
courtneykey = []
def find_message_content(content, fileout):
	for key in conversations.keys():
		for i in conversations.get(key):
			...  # (the rest of this function is truncated on the page)
Example #14
    def render_log(self, source, formatter, prev_url, next_url):
        html = [
            DOC_HEADER % dict(title='',
                              styledefs=formatter.get_style_defs('body'),
                              encoding='utf-8'),
        ]
        if prev_url:
            html.append(u'<a href="%s">Zurück</a>' % (prev_url, ))
        if next_url:
            html.append(u'<a href="%s">Weiter</a>' % (next_url, ))
        html.append(highlight(source, IrcLogsLexer(), formatter))
        if prev_url:
            html.append(u'<a href="%s">Zurück</a>' % (prev_url, ))
        if next_url:
            html.append(u'<a href="%s">Weiter</a>' % (next_url, ))
        html.append(DOC_FOOTER)
        return ''.join(html)
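`DOC_HEADER` and `DOC_FOOTER` are module-level templates not shown here; a hedged sketch of calling the method with Pygments, using minimal stand-ins for both:

from pygments.formatters import HtmlFormatter

# Minimal stand-ins for the templates render_log expects.
DOC_HEADER = ('<html><head><meta charset="%(encoding)s"><title>%(title)s</title>'
              '<style>%(styledefs)s</style></head><body>')
DOC_FOOTER = '</body></html>'

# Assuming `renderer` is an instance of the (unshown) enclosing class:
page = renderer.render_log(open('channel.log').read(), HtmlFormatter(),
                           prev_url='2009-05-01.html', next_url='2009-05-03.html')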