Ejemplo n.º 1
0
def get_html(url):
    """Fetch an article and return its body text with tags and newlines removed.

    The address that is actually fetched is the first value of the ``url``
    parameter found by ``parse_qs`` in *url*. The ``div#article_body``
    element of the fetched document is passed through ``clean_html`` and
    ``strip_tags``, and every newline character is then removed.

    Raises:
        KeyError: if ``parse_qs`` finds no ``url`` parameter in *url*.
    """
    # NOTE(review): parse_qs expects a bare query string; if callers pass a
    # full URL the key will include everything before '?' - confirm.
    true_url = parse_qs(url)['url'][0]
    document = pq(url=true_url)
    article_text = clean_html(document('div#article_body'))
    article_text = strip_tags(article_text)
    # The literal used to be split over two physical lines (a syntax error);
    # it is the newline escape.
    article_text = article_text.replace('\n', '')
    return article_text
    def dump_studentmodules(self, module, display_header, display_prompt, deanonymize):
        '''Build an HTML report of the 'combinedopenended' StudentModule rows for one module.

        ``module`` is a usage-key string. The return value is a
        ``(filename, html)`` tuple: the filename is derived from the parsed
        key with ':' and '/' replaced by '-', and the html is the prettified
        document prefixed with a doctype line.
        '''
        usage_key = UsageKey.from_string(module)
        student_modules = StudentModule.objects.filter(
            module_type='combinedopenended',
            module_state_key=usage_key,
        )

        # Make the key usable as a file name.
        filename = "{0}.html".format(usage_key)
        for separator in (':', '/'):
            filename = filename.replace(separator, '-')

        buffer = io.StringIO()
        try:
            opening_fragments = (
                u'<html><head></head><body>',
                u'<h1>Задание "{0}"</h1>\n\n'.format(display_header),
                u'<p>{0}</p>\n\n'.format(display_prompt),
            )
            for fragment in opening_fragments:
                buffer.write(fragment)
            # Each answer is appended to the buffer by the helper method.
            for student_module in student_modules:
                self.dump_studentmodule_answer(student_module, buffer, deanonymize)
            buffer.write(u'</body></html>')
            raw_html = buffer.getvalue()
        finally:
            buffer.close()

        soup = BeautifulSoup(clean_html(raw_html))
        charset_meta = soup.new_tag('meta')
        charset_meta.attrs['charset'] = 'UTF-8'
        soup.head.append(charset_meta)

        document = u"<!DOCTYPE html>\n" + soup.prettify()
        return (filename, document)
def write_rst(request, rst_template, context, filename=None):
    """Render a template and write the result to an ``.rst`` file.

    The file is written to ``<MEDIA_ROOT>/resume_download/<filename>.rst``;
    the directory is created when it does not exist.

    Args:
        request: Not used by this function.
        rst_template: Template name handed to ``loader.get_template``.
        context: Mapping wrapped in ``Context`` and used to render the template.
        filename: Base name of the output file, without extension. When
            falsy, ``_get_default_filename()`` supplies one.

    Returns:
        The full path of the ``.rst`` file.

    A ``UnicodeEncodeError`` raised while writing is logged rather than
    propagated, so the path is returned even if the file is incomplete.
    """
    if not filename:
        filename = _get_default_filename()
    rst_filename = '%s.rst' % filename
    destination = os.path.join(settings.MEDIA_ROOT, 'resume_download')
    destination_rst = os.path.join(destination, rst_filename)

    if not os.path.exists(destination):
        os.makedirs(destination)

    # Render before opening the file so a template error does not leave a
    # freshly truncated, empty file behind.
    t = loader.get_template(rst_template)
    rst_content = html.clean_html(t.render(Context(context)))
    logger.debug("Writing %s bytes to %s", len(rst_content), destination_rst)
    logger.debug("RST content:\n%s", rst_content)

    # The with-statement closes the file; no explicit close() is needed.
    with open(destination_rst, 'w+') as f:
        try:
            f.write(rst_content)
        except UnicodeEncodeError as e:
            logger.error(rst_content[e.start:e.end])
            # Clamp at 0: a negative start would wrap around to the end of
            # the string and log the wrong context (usually nothing).
            logger.error(rst_content[max(0, e.start - 20):e.end + 20])
            logger.error("%d, %d", e.start, e.end)

    return destination_rst
    def dump_studentmodules(self, module, display_header, display_prompt,
                            deanonymize):
        '''Collect the 'combinedopenended' StudentModule rows of a module into one HTML page.

        Returns a two-item tuple: a file name built from the parsed usage key
        (':' and '/' become '-') and the prettified HTML preceded by a
        doctype line.
        '''
        parsed_key = UsageKey.from_string(module)
        matching_rows = StudentModule.objects.filter(
            module_state_key=parsed_key, module_type='combinedopenended')

        key_as_text = "{0}.html".format(parsed_key)
        filename = key_as_text.replace(':', '-').replace('/', '-')

        heading = u'<h1>Задание "{0}"</h1>\n\n'.format(display_header)
        prompt = u'<p>{0}</p>\n\n'.format(display_prompt)

        with io.StringIO() as out:
            out.write(u'<html><head></head><body>')
            out.write(heading)
            out.write(prompt)
            # The helper writes each student's answer into the same stream.
            for row in matching_rows:
                self.dump_studentmodule_answer(row, out, deanonymize)
            out.write(u'</body></html>')
            page_source = out.getvalue()

        soup = BeautifulSoup(clean_html(page_source))
        meta = soup.new_tag('meta')
        meta.attrs['charset'] = 'UTF-8'
        soup.head.append(meta)

        return (filename, u"<!DOCTYPE html>\n" + soup.prettify())
Ejemplo n.º 5
0
def products_info():
    """Crawl the product listings of every category and store each product.

    For each ``tb_category_info`` row whose ``pid`` is not ``'0'`` the listing
    pages are fetched one after another by following the match of
    ``next_pattern``. Every product link found on a page is fetched, parsed
    with the module-level patterns and saved as a ``tb_product_info`` row.

    A product that fails for any reason is reported on stdout and its URL is
    appended to ``product_error.txt``; the crawl then continues.
    """
    results = tb_category_info.select().where(pid__ne = '0').execute()
    for result in results:
        url_path = PRODUCTS_BASE_URL + result.cid + '/' + result.url
        while url_path:
            print(url_path)
            listing = session.get(url_path)
            # Collect this page's product links whether or not a "next" link
            # exists; previously the last (or only) page was skipped.
            urls = product_url_pattern.findall(listing.text)
            next_links = next_pattern.findall(listing.text)
            url_path = next_links[0] if next_links else None
            for url in urls:
                try:
                    print(url)
                    product_id, product_name = product_id_pattern.findall(url)[0]
                    # Separate name so the listing response is not overwritten.
                    page = session.get(url)
                    company_id = company_id_pattern.findall(page.text)[0]
                    product_description = product_description_pattern.findall(page.text)[0].strip()
                    product_info = {
                        'cid': result.cid,
                        'product_id': product_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': clean_html(product_description),
                    }
                    print(product_info)
                    product_info_db = tb_product_info(**product_info)
                    product_info_db.save()
                except Exception:
                    # Best effort: record the failing URL and keep crawling.
                    print(traceback.format_exc())
                    with open('product_error.txt', 'a') as FILE:
                        FILE.write(url + '\n')
                    print('出错')
Ejemplo n.º 6
0
 def save(self, force_insert=False, force_update=False):
     """Clean the HTML body, derive a missing announcement, then save.

     ``self.html`` is replaced by the result of ``clean_html``. When
     ``self.anounce`` is empty and there is HTML, the announcement becomes
     the tag-stripped HTML truncated to 100 words.
     """
     from django.utils.text import truncate_words
     from django.utils.html import clean_html, strip_tags

     self.html = clean_html(self.html)

     if not self.anounce:
         if self.html:
             plain_text = strip_tags(self.html)
             self.anounce = truncate_words(plain_text, 100)

     # Delegate to the parent class for the actual save.
     super(News, self).save(force_insert, force_update)
Ejemplo n.º 7
0
    def save(self, *args, **kwargs):
        """Clean the post content, save it, and make sure its counters exist.

        After the parent ``save`` runs, a ``PostMeta`` row is fetched or
        created for each of the 'views' and 'comments_count' keys; a newly
        created row gets the value '0'.
        """
        self.content = clean_html(self.content)

        super(Post, self).save(*args, **kwargs)

        # Rows that already exist keep their current value.
        for counter_key in ('views', 'comments_count'):
            meta, is_new = PostMeta.objects.get_or_create(post=self, meta_key=counter_key)
            if not is_new:
                continue
            meta.meta_value = '0'
            meta.save()
Ejemplo n.º 8
0
    def save(self, *args, **kwargs):
        """Clean the post content (best effort), save it, and ensure its counters exist.

        Positional and keyword arguments are forwarded to the parent
        ``save``; calling ``save()`` with no arguments behaves as before.

        If ``html.clean_html`` fails, the failure is logged and the content
        is saved as it is. After saving, a ``PostMeta`` row is fetched or
        created for each of the 'views' and 'comments_count' keys; a newly
        created row gets the value '0'.
        """
        import logging

        try:
            self.content = html.clean_html(self.content)
        except Exception:
            # Still best effort, but no longer silent, and no longer catching
            # KeyboardInterrupt/SystemExit as the bare ``except:`` did.
            logging.getLogger(__name__).exception(
                "clean_html failed; saving post content unchanged")
        super(Post, self).save(*args, **kwargs)

        # Rows that already exist keep their current value.
        for meta_key in ('views', 'comments_count'):
            pm, created = PostMeta.objects.get_or_create(post=self, meta_key=meta_key)
            if created:
                pm.meta_value = '0'
                pm.save()
Ejemplo n.º 9
0
def render_content(content, text_type, images=None):
    """Render *content* to HTML according to its markup type.

    Args:
        content: Source text. Falsy content renders as an empty string.
        text_type: One of the ``MARKUP_*`` constants, or anything ``int()``
            accepts that equals one of them.
        images: Not used by the code visible here.

    Returns:
        The rendered markup; ``'UNKNOWN CONTENT <n>'`` for an unrecognised
        type; ``'ERROR: <message>'`` when rendering raises
        ``template.TemplateSyntaxError``.
    """
    if not content:
        return ''

    try:
        text_type = int(text_type)

        if text_type == MARKUP_PLAIN_TEXT:
            ret = html.linebreaks(html.escape(content))
        elif text_type == MARKUP_HTML:
            ret = html.clean_html(content)
        elif text_type == MARKUP_TEXTILE:
            ret = markup.textile(content)
        elif text_type == MARKUP_MARKDOWN:
            ret = markup.markdown(content)
        elif text_type == MARKUP_REST:
            ret = markup.restructuredtext(content)
        else:
            # Not expected for stored content, but reported rather than hidden.
            return 'UNKNOWN CONTENT %d' % text_type

    except template.TemplateSyntaxError as err:
        return 'ERROR: %s' % err

    # The rendered text used to be computed and then dropped, so callers
    # received None on the success path.
    return ret
Ejemplo n.º 10
0
def changelog_entry(request, slug):
    """Render the changelog page for the Change identified by *slug*.

    The slug is passed through ``clean_html`` before ``models.Change`` is
    looked up with ``get_object_or_404``. The page is rendered from the
    'changelog/change.html' template with a ``PageContext`` as extra context.
    """
    slug = clean_html(slug)
    change = get_object_or_404(models.Change, slug = slug)
    # locals() passes every local name defined so far (request, slug, change)
    # to PageContext as ``d``; adding or renaming locals above this line
    # changes what ``d`` contains.
    context = PageContext(request, "Change:%s" %change.title, d = locals())
    return direct_to_template(request, template = 'changelog/change.html', extra_context = context)
Ejemplo n.º 11
0
 def custom_cliente(self, obj):
     """Return an anchor labelled with ``obj.cliente``, passed through ``clean_html``."""
     link_markup = "<a href='#'>%s</a>" % obj.cliente
     return clean_html(link_markup)