Example #1
0
def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links
    timing stats:
    before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):  #links:
        oldlink = link
        # Shorten overly long hrefs via the URL shortener when enabled.
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            #make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link is not None:  # identity test, not equality
                link = short_link

        if validate_link(link) and link.get('href', None):
            if not link.text:
                # No anchor text: replace the tag with the raw URL text.
                oldlink.replaceWith(
                    link.get('href', "No href link to replace with"))
            else:
                # Keep anchor text and append the URL in parentheses below it.
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % (link.get('href')))
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)
            print

    return soup
Example #2
0
def anchorArticles(txt):
    """Wrap each text node starting with 'Article <number>' in a named <a>
    anchor and prepend a hoverable '#' link pointing at that anchor."""
    # Raw string: the pattern contains backslash escapes (\s) that would
    # otherwise be fragile/deprecated escape sequences in a plain string.
    aregex = re.compile(r'^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)
    node = nsoup.find(text=aregex)
    while node:
        nodeidx = node.parent.contents.index(node)
        match = str(re.match(aregex, node).group())
        # create named <a>
        name = match.replace(' ', '_')
        a = Tag(nsoup, 'a', [('name', name)])
        a.insert(0, match)
        # create a link that is displayed if the <a> is hovered
        link = Tag(nsoup, 'a', [('class', "anchorLink"), ('href', '#' + name)])
        link.insert(0, "#")
        # create a container for the a and the link
        hover = Tag(nsoup, 'span', [('class', 'hover')])
        hover.insert(0, a)
        hover.insert(0, link)
        node.parent.insert(nodeidx, hover)
        # cut the newly wrapped prefix from the original node.
        newNode = NavigableString(node[len(match):])
        node.replaceWith(newNode)
        node = newNode.findNext(text=aregex)
    return str(nsoup)
    def unTag(self, tag):
        """
            recursively removes unwanted tags according to defined lists
            @param tag: tag hierarchy to work on
        """
        # Depth-first: clean all children before deciding this tag's fate.
        for child in tag.findChildren(True, recursive=False):
            self.unTag(child)
        # Drop the whole subtree when its class matches the removal regexp.
        if (self.remove_classes_regexp != "") and (
                tag.has_key("class") and
            (re.match(self.remove_classes_regexp, tag["class"]) != None)):
            tag.extract()
        elif tag.name in self.keep_tags:
            # Keep the tag but rebuild it bare, stripping all attributes.
            new_tag = Tag(self.input, tag.name)
            new_tag.contents = tag.contents
            tag.replaceWith(new_tag)

        elif tag.name in self.remove_tags_keep_content:
            # Unwrap: keep the children, drop the tag itself.
            children = tag.findChildren(True, recursive=False)
            if len(children) == 1:
                tag.replaceWith(children[0])
            elif len(children) > 1:
                # Several children need a single replacement node: wrap in <p>.
                new_tag = Tag(self.input, "p")
                for child in tag.findChildren(True, recursive=False):
                    new_tag.append(child)
                tag.replaceWith(new_tag)
            else:
                # No element children: replace with the rendered text content.
                tag.replaceWith(tag.renderContents())
        else:
            # Tag is in no keep list: remove it and its contents entirely.
            tag.extract()
Example #4
0
    def _set_element(self, root, tagname, text=None, attr=None):
        """Creates if not available an element at the soup root element

        :param root: tag under which the element is searched for / inserted
        :param tagname: element name, matched case-insensitively
        :param text: optional text content to place inside the element
        :param attr: optional dict of attributes used for lookup and creation
        :return: tag object or None
        :rtype: Tag
        """

        # Add Topic if not available ("tagname$" anchors the end of the name,
        # matched case-insensitively).
        if attr is None:
            if root.find(re.compile(tagname + "$", re.I)) is None:
                new_tag = Tag(self._soup, tagname)
                root.insert(0, new_tag)
        else:
            if root.find(re.compile(tagname + "$", re.I), attr) is None:
                new_tag = Tag(self._soup, tagname, attr.items())
                root.insert(0, new_tag)

        # NOTE(review): the re-lookup starts from self.root rather than the
        # `root` parameter — presumably both live in the same subtree; confirm.
        settings = self._soup.find(self.root)
        tag = settings.find(re.compile(tagname + "$", re.I))

        # Something to insert
        if tag is not None and text is not None:
            if tag.text.strip() == "":
                # Element is empty: insert the text as a new child node.
                tag.insert(0, NavigableString(text))
            else:
                # Element already has content: replace its first child.
                tag.contents[0].replaceWith(text)

        return tag
Example #5
0
	def setup_source(self):
		"""Ensure sources.xml contains a <video> section with a
		'PVR Recordings' source, creating and saving the file if needed."""
		source_path = vfs.join('special://profile/', 'sources.xml')
		try:
			soup = vfs.read_file(source_path, soup=True)
		except Exception:
			# No readable sources.xml yet: start a fresh document.
			soup = BeautifulSoup()
			sources_tag = Tag(soup, "sources")
			soup.insert(0, sources_tag)

		if soup.find("video") is None:
			sources = soup.find("sources")
			if not sources: return
			video_tag = Tag(soup, "video")
			sources.insert(0, video_tag)

		video = soup.find("video")
		# Only add the PVR source once; write the file back only when changed.
		if len(soup.findAll(text="PVR Recordings")) < 1:
			pvr_source_tag = Tag(soup, "source")
			pvr_name_tag = Tag(soup, "name")
			pvr_name_tag.insert(0, "PVR Recordings")
			PVR_PATH_tag = Tag(soup, "path")
			PVR_PATH_tag['pathversion'] = 1
			PVR_PATH_tag.insert(0, "pvr://recordings/active/Default/")
			pvr_source_tag.insert(0, pvr_name_tag)
			pvr_source_tag.insert(1, PVR_PATH_tag)
			video.insert(2, pvr_source_tag)
			# Serialize the whole soup back to text for writing.
			string = ""
			for i in soup:
				string = string + str(i)

			vfs.write_file(source_path, string)
Example #6
0
def do_iperimage(value):
    '''detects iPernity static urls and creates clickable thumbnail for it'''

    soup = BeautifulSoup(value)
    # Raw string: the pattern is full of backslash escapes (\w, \d, \.).
    iprl = re.compile(
        r'^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$'
    )
    iprl_thumb = '500'
    iprl_zoom = '560'

    for img in soup.findAll('img', src=iprl):

        match = iprl.match(img['src'])
        try:
            # Inline thumbnail at the 500px size.
            thumb = Tag(soup, 'img')
            thumb['alt'] = img['title']
            thumb['src'] = match.group(1) + iprl_thumb + match.group(3)

            # Lightbox link pointing at the 560px zoom size.
            link = Tag(soup, 'a')
            link['href'] = match.group(1) + iprl_zoom + match.group(3)
            link['rel'] = 'lightbox'
            link['title'] = img['title']
            link.insert(0, thumb)

            img.replaceWith(link)
        except Exception:
            # Best-effort: skip images missing expected attributes
            # (e.g. no 'title'); never abort the whole document.
            pass

    return unicode(soup)
Example #7
0
def generate_table(summary):
    soup = BeautifulSoup()
    new_tag_table = Tag(soup, "table")
    new_tag_table["border"] = 1
    new_tag_table["cellspacing"] = 0
    new_tag_table["cellpadding"] = 0
    new_tag_table["bordercolordark"] = "#000000"
    new_tag_table["cellspacing"] = "#ffffff"
    soup.append(new_tag_table)
    new_Tag_tr = Tag(soup, "tr")
    new_Tag_tr["bgcolor"] = "#0072E3"
    new_tag_table.append(new_Tag_tr)
    for i in ["TestSuite", "Passed", "Failed", "Total"]:
        new_Tag_td = Tag(soup, "td")
        new_Tag_td.string = str(i)
        new_Tag_tr.append(new_Tag_td)
    for i in summary:
        new_Tag_tr = Tag(soup, "tr")
        new_tag_table.append(new_Tag_tr)
        for j in i:
            new_Tag_td = Tag(soup, "td")
            new_Tag_td.string = str(j)
            new_Tag_tr.append(new_Tag_td)
    print str(soup.prettify())
    return str(soup.prettify())
def generateContentDivTag(baseDir, h3text):
    """Build a content <div>: an <h3> heading plus, for each .shtml page under
    baseDir, a band <ul> containing one nested album <ul> per linked album.

    Side effect: increments __main__.contentCount once per album link found.
    """
    import __main__

    contentDivTag = Tag(formatSoup, 'div', attrs={'class' : 'content band-content'})
    # Embed the section heading.
    h3tag = Tag(formatSoup, 'h3')
    h3tag.append(NavigableString(h3text))
    contentDivTag.append(h3tag)


    # Generate the HTML.
    for file in os.listdir(PARENT_DIR + baseDir):
        if file.endswith(SHTML_EXT):
            # Build the band-name ul tag.
            progreUlTag = generateUlTag('/' + baseDir, file, 'column')
            albumLiTag = Tag(formatSoup, 'li')
            progreUlTag.append(albumLiTag)

            # Collect album page names from the band page's album links.
            fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, '/' + baseDir, file])))
            albumList = []
            for albumClassTag in fileSoup.findAll('a', {'class' : 'album-name'}):
                albumList.append(albumClassTag['href'].split('/')[-1])
                __main__.contentCount += 1

            # Build one album ul per collected album page.
            albumDir = '/'.join([baseDir, file.split('.')[0]])
            for album in albumList:
                albumUlTag = generateUlTag('/' + albumDir, album, 'child-column')
                albumLiTag.append(albumUlTag)
            contentDivTag.append(progreUlTag)

    return contentDivTag
def get_first_three(soup, table):
    """Extract the first three cells of each row of *table*; the first
    non-empty cell overall becomes a title <span>, the remaining cells become
    ' - '-joined <li> items.  Returns a <div> holding the title and the <ul>.
    """
    loop = 0   # column index within the current row (only columns 0-2 used)
    first = 1  # flag: the first non-empty cell overall is treated as title
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for td in tr.findAll("td"):
            if loop != 3:
                try:
                    text = ''.join(td.findAll(text=True))
                    text = text.strip()
                    # Skip blank cells and literal '&nbsp;' placeholders.
                    if text != '' and text != '&nbsp;':
                        td.name = "span"
                        if first == 1:
                            first = 0
                            enclose.append(td)
                        else:
                            # ' - ' separates cells, except after column 2.
                            if loop != 2: td.append(' - ')
                            li.append(td)
                except:
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Keep only rows that produced visible text.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    title = enclose.find("span")
    # Detach the title span from the list before re-parenting it below.
    enclose.find("span").replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    div.append(title)
    div.append(enclose)
    return div
def get_last_3(soup, table):
    """Collect cells 4+ of each row of *table* (max three per row) into <li>
    items, ' - '-joined, returning a <div> of the non-empty items.

    Cleanup: removed the unused locals `first` and `ul` that the original
    created but never read, and narrowed the bare except.
    """
    loop = 0  # column counter within the sliced row (only 0-2 used)
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True))
                    text = text.strip()
                    # Skip blank cells and literal '&nbsp;' placeholders.
                    if text != '' and text != '&nbsp;':
                        el.name = "span"
                        if loop != 2: el.append(' - ')
                        li.append(el)
                except Exception:
                    pass
            else:
                break
            loop += 1
        loop = 0
        # Keep only rows that produced visible text.
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def _tag_generator(soup, name, attrs=None, contents=None):
    """Create a new Tag named *name* in *soup*.

    :param attrs: optional attribute list; defaults to None instead of the
        original mutable [] default (classic shared-default pitfall) — the
        observable behavior is identical since Tag's own default is no attrs.
    :param contents: optional initial content inserted at position 0.
    :return: the new Tag
    """
    if attrs:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents is not None:
        new_tag.insert(0, contents)
    return new_tag
Example #12
0
def generate_table_of_contents(soup, prefix):
    """Build a nested <div class="toc"><ul>...</ul></div> linking to every
    header in *soup*, assigning each header a unique, ascii-safe id prefixed
    with *prefix*.  Returns None when the document has no headers.
    """
    header_ids = Counter()  # tracks how often each candidate id was used
    headers = soup.findAll(header_re)
    if not headers:
        return
    tocdiv = Tag(soup, "div", [("class", "toc")])
    parent = Tag(soup, "ul")
    parent.level = 0  # nesting level stored on the tag for later unwinding
    tocdiv.append(parent)
    level = 0
    previous = 0
    for header in headers:
        contents = u''.join(header.findAll(text=True))

        # In the event of an empty header, skip
        if not contents:
            continue

        # Convert html entities to avoid ugly header ids
        aid = unicode(
            BeautifulSoup(contents,
                          convertEntities=BeautifulSoup.XML_ENTITIES))
        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
        # Convert down to ascii replacing special characters with hex
        aid = str(title_re.sub(lambda c: '.%X' % ord(c.group()), aid))

        # Check to see if a tag with the same ID exists
        id_num = header_ids[aid] + 1
        header_ids[aid] += 1
        # Only start numbering ids with the second instance of an id
        if id_num > 1:
            aid = '%s%d' % (aid, id_num)

        header['id'] = aid

        li = Tag(soup, "li", [("class", aid)])
        a = Tag(soup, "a", [("href", "#%s" % aid)])
        a.string = contents
        li.append(a)

        # Header depth comes from the tag name's last character (h1 -> 1).
        thislevel = int(header.name[-1])

        if previous and thislevel > previous:
            # Deeper header: open a nested <ul> under the current parent.
            newul = Tag(soup, "ul")
            newul.level = thislevel
            parent.append(newul)
            parent = newul
            level += 1
        elif level and thislevel < previous:
            # Shallower header: unwind to the matching ancestor <ul>.
            while level and parent.level > thislevel:
                parent = parent.findParent("ul")
                level -= 1

        previous = thislevel
        parent.append(li)

    return tocdiv
Example #13
0
def AllCategories(request):
	"""Render an HTML tree of top-level categories, each node carrying the
	user's latest rating, 3D availability, category type and name divs."""
	print 'allcat'
	x = BeautifulSoup()
	#root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
	#x.insert(0,root)
	AllCategories = RECategory.objects.filter(parent__isnull=True).order_by('-number')
	
	AllAnswered = {}
	# Keep only the most recent RELog per (category, type_log) for this user
	# (the queryset is ordered newest-first, so the first one seen wins).
	for log in RELog.objects.filter(user=request.user).order_by('-date'):
		if not log.category_id in AllAnswered:
			AllAnswered[log.category_id] = {}
		if not log.type_log in AllAnswered[log.category_id]:
			AllAnswered[log.category_id][log.type_log] = log
	for category in AllCategories:
		print category.id
		nt = Tag(x,'li', [("id", str(category.id))])
		log = AllAnswered.get(category.id)
		rating = ''
		if log:
			# type_log 5 is the rating entry.
			log = log.get(5)
			if log :
				rating = 'Оценка: ' + str(log.rating)
		div = Tag(x,'div')
		div.string = rating
		div["class"] = "rating"
		#div["style"] = "width: 150px; float: right;"
		nt.insert(0, div)
		
		if category.is_3d:
			isDDD = "Есть";
		else:
			isDDD = "Нет";
		div = Tag(x,'div')
		div.string = isDDD 
		div["class"] = "is3d"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div["class"] = "demo"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		div.string = str(category.type_category)
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div.string = category.name
		nt.insert(0, div)
		
		x.insert(0,nt)
		recurseCategories(category, nt, x, AllAnswered)
	res = x.prettify()
	#print res
	print 'endallcat'
	return res
Example #14
0
    def SetupAmazonLibrary(self):
        """Add or refresh the Amazon Movies/TV source paths in Kodi's
        sources.xml; prompts the user to restart when anything changed."""
        source_path = xbmc.translatePath(
            'special://profile/sources.xml').decode('utf-8')
        source_added = False
        # Display name -> plugin path for each source to register.
        source = {
            self._s.ms_mov: self._s.MOVIE_PATH,
            self._s.ms_tv: self._s.TV_SHOWS_PATH
        }

        if xbmcvfs.exists(source_path):
            srcfile = xbmcvfs.File(source_path)
            soup = BeautifulSoup(srcfile)
            srcfile.close()
        else:
            # No sources.xml yet: build a skeleton with the standard
            # category sections Kodi expects.
            subtags = ['programs', 'video', 'music', 'pictures', 'files']
            soup = BeautifulSoup('<sources></sources>')
            root = soup.sources
            for cat in subtags:
                cat_tag = Tag(soup, cat)
                def_tag = Tag(soup, 'default')
                def_tag['pathversion'] = 1
                cat_tag.append(def_tag)
                root.append(cat_tag)

        video = soup.find("video")

        for name, path in source.items():
            path_tag = Tag(soup, "path")
            path_tag['pathversion'] = 1
            path_tag.append(path)
            source_text = soup.find(text=name)
            if not source_text:
                # Source missing: add <source><name/><path/></source>.
                source_tag = Tag(soup, "source")
                name_tag = Tag(soup, "name")
                name_tag.append(name)
                source_tag.append(name_tag)
                source_tag.append(path_tag)
                video.append(source_tag)
                Log(name + ' source path added!')
                source_added = True
            else:
                # Source exists: replace its path only when it changed.
                source_tag = source_text.findParent('source')
                old_path = source_tag.find('path').contents[0]
                if path not in old_path:
                    source_tag.find('path').replaceWith(path_tag)
                    Log(name + ' source path changed!')
                    source_added = True

        if source_added:
            self.SaveFile(source_path, str(soup))
            self._g.dialog.ok(getString(30187), getString(30188),
                              getString(30189), getString(30190))
            if self._g.dialog.yesno(getString(30191), getString(30192)):
                xbmc.executebuiltin('RestartApp')
Example #15
0
    def parse_content(self, content, attachments, tags):
        """Rewrite hash-referenced attachments in *content* to locally
        imported files and return the transformed HTML.

        :param content: HTML note body containing src="?hash=..." references
        :param attachments: list of dicts with 'hash' and 'filename' keys
        :param tags: unused here; kept for interface compatibility
        """
        soup = BeautifulSoup(content)
        pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

        # images: swap each hash reference for the imported file path.
        for match in soup.findAll('img'):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    importedname = self.import_file(filename)
                    match.replaceWith(Tag(soup, 'img', [('src', importedname)]))


        # pdfs: convert each page to an image and inline them all.
        for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    # convert pdf -> image (one image per page)
                    images = pdf2image(filename)

                    # import each jpg; enumerate replaces the original
                    # images.index(image) lookup, which was O(n^2) and wrong
                    # when two pages rendered to identical values
                    imageTags = Tag(soup, "span")
                    for position, image in enumerate(images):
                        importedname = self.import_file(image)
                        # add new image tag
                        imageTags.insert(position, Tag(soup, 'img', [('src', importedname)]))

                    # replace embed with <img src...> for each image
                    match.replaceWith(imageTags)

        # TODO: audio
        # TODO: video


        #plugins          

        # TODO: qa-format as in Supermemo
        #for match in soup.find(string=re.compile("A:")):
        #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']
        


        return str(soup).decode('utf-8')
Example #16
0
def createParentUlTag(targetSoup):
    """Build the breadcrumb <ul> seeded with a single 'TOP' home link."""
    breadcrumb_attrs = {'class': 'xbreadcrumbs', 'id': 'breadcrumbs'}
    parent_ul = Tag(targetSoup, 'ul', attrs=breadcrumb_attrs)
    # The first crumb always points at the site root.
    home_anchor = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    home_anchor.append(NavigableString('TOP'))
    home_item = Tag(targetSoup, 'li')
    home_item.append(home_anchor)
    parent_ul.append(home_item)
    return parent_ul
def generateUlTag(path, file, ulClass):
    """Build <ul class=ulClass><li><a href=path/file>...</a></li></ul> where
    the link text is the <h1> content of the referenced page."""
    # Pull the page heading to use as the link text.
    pageSoup = BeautifulSoup(open('/'.join([PARENT_DIR, path, file])))
    headingHtml = pageSoup.find('h1').renderContents()

    anchorTag = Tag(formatSoup, 'a', attrs={'href': '/'.join([path, file])})
    anchorTag.append(NavigableString(headingHtml))

    itemTag = Tag(formatSoup, 'li')
    itemTag.append(anchorTag)

    listTag = Tag(formatSoup, 'ul', attrs={'class': ulClass})
    listTag.append(itemTag)
    return listTag
Example #18
0
def SetupAmazonLibrary():
    """Add or refresh the Amazon Movies/TV source paths in Kodi's sources.xml
    and offer an application restart when anything changed."""
    common.Log('Trying to add Amazon source paths...')
    source_path = os.path.join(common.profilpath, 'sources.xml')
    source_added = False
    source = {'Amazon Movies': MOVIE_PATH, 'Amazon TV': TV_SHOWS_PATH}

    try:
        # `src_file` instead of `file`: avoid shadowing the builtin.
        src_file = open(source_path)
        soup = BeautifulSoup(src_file)
        src_file.close()
    except IOError:
        # sources.xml missing/unreadable: build a skeleton with the
        # standard category sections Kodi expects.
        subtags = ['programs', 'video', 'music', 'pictures', 'files']
        soup = BeautifulSoup('<sources></sources>')
        root = soup.sources
        for cat in subtags:
            cat_tag = Tag(soup, cat)
            def_tag = Tag(soup, 'default')
            def_tag['pathversion'] = 1
            cat_tag.append(def_tag)
            root.append(cat_tag)

    video = soup.find("video")

    for name, path in source.items():
        path_tag = Tag(soup, "path")
        path_tag['pathversion'] = 1
        path_tag.append(path)
        source_text = soup.find(text=name)
        if not source_text:
            # Source missing: add <source><name/><path/></source>.
            source_tag = Tag(soup, "source")
            name_tag = Tag(soup, "name")
            name_tag.append(name)
            source_tag.append(name_tag)
            source_tag.append(path_tag)
            video.append(source_tag)
            common.Log(name + ' source path added')
            source_added = True
        else:
            # Source exists: replace its path only when it changed.
            source_tag = source_text.findParent('source')
            old_path = source_tag.find('path').contents[0]
            if path not in old_path:
                source_tag.find('path').replaceWith(path_tag)
                common.Log(name + ' source path changed')
                source_added = True

    if source_added:
        SaveFile(source_path, str(soup))
        Dialog.ok(common.getString(30187), common.getString(30188),
                  common.getString(30189), common.getString(30190))
        if Dialog.yesno(common.getString(30191), common.getString(30192)):
            xbmc.executebuiltin('RestartApp')
Example #19
0
def userlist(request):
        """Return an XML document listing every user in the cadets group."""
        doc = BeautifulSoup()
        root = Tag(doc, 'root')
        doc.insert(0, root)
        cadets = models.Group.objects.get(name='Курсанты').user_set.all()
        for member in cadets:
                root.insert(0, '\n')
                user_tag = Tag(doc, 'user', [
                        ('uid', str(member.id)),
                        ('username', member.username),
                        ('first_name', member.first_name),
                        ('last_name', member.last_name),
                        ])
                root.insert(0, user_tag)

        return HttpResponse(doc)
Example #20
0
 def CreateSelect(self):
     '''Create a <select> element populated with random <option> children.'''
     select = Tag(self.soup, 'select')
     select['name'] = self.GenerateName()
     # 3-12 options, all sharing the same filler value/text.
     for _ in range(random.randint(3, 12)):
         option = Tag(self.soup, 'option')
         option['value'] = self.textShort
         option.string = self.textShortCap
         select.append(option)
     # With 80% probability mark the first option as pre-selected.
     if self._Probability(80):
         select.option['selected'] = 'selected'
     self.AppendIds(select, 10, 30)
     self.ShuffleAttributes(select)
     return select
Example #21
0
def generate_heatmap(intensities):
    # Load the SVG map
    svg = open('counties.svg', 'r').read()
    # Load into Beautiful Soup
    soup = BeautifulSoup(svg, selfClosingTags=['defs', 'sodipodi:namedview'])
    # Find counties
    paths = soup.findAll('path')
    colors = [
        "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5",
        "#08519C", "#08306B"
    ]
    min_value = min(intensities.values())
    max_value = max(intensities.values())
    scalefactor = (len(colors) - 1) / (log(max_value + 1) - log(min_value + 1))
    # County style
    path_style = 'font-size:12px;fill-rule:nonzero;stroke:#FFFFFF;stroke-opacity:1;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none;stroke-linecap:butt;marker-start:none;stroke-linejoin:bevel;fill:'
    # we will append this hover tooltip after each county path
    hover_text = '''<text id="popup-%s" x="%s" y="%s" font-size="10" fill="black" visibility="hidden">%s (%s)<set attributeName="visibility" from="hidden" to="visible" begin="%s.mouseover" end="%s.mouseout"/></text>'''
    for p in paths:
        if p['id'] not in ["State_Lines", "separator"]:
            try:
                count = intensities[p['id']]
            except:
                count = 0
            x, y = (p['d'].split()[1]).split(',')
            # insert a new text tag for the county hover tooltip...
            p.parent.insert(0, Tag(soup, 'text', [("id", 'popup-' + p['id'])]))
            hover = soup.find("text", {"id": 'popup-' + p['id']})
            hover.insert(1, "%s (%s)" % (p['inkscape:label'], str(count)))
            # add attributes to that text tag...
            hover['x'] = 250
            hover['y'] = 20
            hover['font-size'] = "20"
            hover['fill'] = "black"
            hover['visibility'] = "hidden"
            hover.insert(0,
                         Tag(soup, 'set', [("begin", p['id'] + '.mouseover')]))
            set_tag = soup.find("set", {"begin": p['id'] + '.mouseover'})
            set_tag['attributeName'] = "visibility"
            set_tag['from'] = "hidden"
            set_tag['to'] = "visible"
            set_tag['end'] = p['id'] + '.mouseout'
            color_class = min(int(scalefactor * log(count + 1)),
                              len(colors) - 1)
            # color_class = int((float(len(colors)-1) * float(count - min_value)) / float(max_value - min_value))
            # if count > 0:
            #   print color_class
            color = colors[color_class]
            p['style'] = path_style + color
    print soup.prettify()
Example #22
0
def makeHTMLQuestion(fn, htmldata):
    """Prepare an MTurk ExternalQuestion from local HTML: inject the
    assignment-id JS, point forms at the (sandbox) externalSubmit endpoint,
    upload the page and its images, and return the ExternalQuestion.

    :param fn: local path of the HTML file (used to resolve image paths)
    :param htmldata: HTML source to transform
    """
    soup = BeautifulSoup(htmldata)
    #add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    #replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                # `testmode` selects the worker sandbox endpoint.
                if testmode:
                    form[
                        'action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form[
                        'action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            # Hidden input the injected JS fills with the assignment id.
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    # Upload referenced images relative to the HTML file's directory.
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
Example #23
0
def clean_html_style(data, element, remove_comments=True, remove_empty=True):
    """removes the style information associated with html element

    >>> t = '<!--  /* Style Definitions */ table.MsoNormalTable	{mso-style-name:"Table Normal";	mso-tstyle-rowband-size:0;	mso-tstyle-colband-size:0;	mso-style-noshow:yes;	mso-style-priority:99;	mso-style-qformat:yes;	mso-style-parent:"";	mso-padding-alt:0in 5.4pt 0in 5.4pt;	mso-para-margin-top:0in;	mso-para-margin-right:0in;	mso-para-margin-bottom:10.0pt;	mso-para-margin-left:0in;	line-height:115%;	mso-pagination:widow-orphan;	font-size:11.0pt;	font-family:"Calibri","sans-serif";	mso-ascii-font-family:Calibri;	mso-ascii-theme-font:minor-latin;	mso-hansi-font-family:Calibri;	mso-hansi-theme-font:minor-latin;} --><p>  </p><p class="MsoNormalTable" style="margin-bottom: 0.0001pt; line-height: normal;">New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">?</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p')
    '<p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_empty=False)
    '<p> </p><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_comments=False)
    '<!--  /* Style Definitions */ table.MsoNormalTable\t{mso-style-name:"Table Normal";\tmso-tstyle-rowband-size:0;\tmso-tstyle-colband-size:0;\tmso-style-noshow:yes;\tmso-style-priority:99;\tmso-style-qformat:yes;\tmso-style-parent:"";\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\tmso-para-margin-top:0in;\tmso-para-margin-right:0in;\tmso-para-margin-bottom:10.0pt;\tmso-para-margin-left:0in;\tline-height:115%;\tmso-pagination:widow-orphan;\tfont-size:11.0pt;\tfont-family:"Calibri","sans-serif";\tmso-ascii-font-family:Calibri;\tmso-ascii-theme-font:minor-latin;\tmso-hansi-font-family:Calibri;\tmso-hansi-theme-font:minor-latin;} --><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    """
    # The original wrapped this in try/except with an identical call in the
    # handler — a no-op fallback; a single parse is equivalent.
    soup = BeautifulSoup(data)
    # remove all comments in this html block
    if remove_comments:
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

    # remove all occurences of tags like sup, script
    [i.extract() for i in soup.findAll(re.compile('sup|script'))]

    # find all occurences of the "element" tag
    for i in soup.findAll(element):
        text = i.renderContents().strip()
        if text:
            # Rebuild the tag bare, stripping class/style attributes.
            new_tag = Tag(soup, element)
            new_tag.insert(0, text)
            i.replaceWith(new_tag)
        elif remove_empty:
            i.extract()
    return smart_unicode(soup.renderContents())
Example #24
0
 def _linkify_headings(self, soup):
     """Wrap the contents of every top-level h1/h2/h3 inside the md div in a
     self-referencing anchor (href '#<heading id>')."""
     md_container = soup.find('div', 'md')
     for hdr in md_container.findAll(['h1', 'h2', 'h3'], recursive=False):
         anchor = Tag(soup, "a", [('href', '#%s' % hdr['id'])])
         # Move the heading's children into the anchor, then re-attach it.
         anchor.contents = hdr.contents
         hdr.contents = []
         hdr.append(anchor)
Example #25
0
def initServerInfoBase(fileName):
    """
    @description: Initializes soup for the Beautiful Soup parser. Reads the existing data from the fileName parameter.
    @todo: None
    @param fileName: String, name of the file to be loaded in soup.
    @return: (soup, Boolean) - the soup and True on success; (None, False) if the file could not be read.
    """
    if os.path.exists(fileName):
        try:
            # `with` guarantees the handle is closed even if read() fails.
            with open(fileName, "r") as f:
                xml = f.read()
        except IOError:
            return None, False
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        # No file yet: start with an empty document.
        serverinfolist = []
        soup = BeautifulSoup()

    # Ensure at least one <serverinfo> root element exists.
    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)

    return soup, True
Example #26
0
def content_absolute_links(content, image=None):
    """Rewrite every <a href> and <img src> in *content* to absolute URLs on
    the current site; optionally prepend an <img> with the given src."""
    from django.contrib.sites.models import Site
    current_site = Site.objects.get(pk=settings.SITE_ID)

    def abs_url(url):
        parsed = urlparse.urlparse(url)
        # A URL with neither scheme nor host is site-relative: anchor it
        # to the current site's domain.
        if parsed.netloc == parsed.scheme == '':
            base = 'http://{0}'.format(current_site.domain)
            url = urlparse.urljoin(base, url)
        return url

    soup = BeautifulSoup(content)

    if image:
        soup.insert(0, Tag(soup, 'img', [('src', image)]))

    for anchor in soup.findAll('a'):
        anchor['href'] = abs_url(anchor['href'])

    for img_tag in soup.findAll('img'):
        img_tag['src'] = abs_url(img_tag['src'])

    return unicode(soup)
Example #27
0
    def get_mobile_content(self, obj):
        """Return mobile-ready HTML for *obj*: prefers obj.mobile_content,
        strips "\\n<br />\\n" runs for app clients, rewrites YouTube iframes
        for Android, and appends a sources paragraph when present."""
        if obj.mobile_content:
            content = obj.mobile_content
        else:
            content = obj.content

        if not self.host_det:  # apps only
            content = content.replace("\n<br />\n", "\n")
        elif self.host_det == "android":
            content = content.replace("\n<br />\n", "\n")
            soup = BeautifulSoup(content)
            # if soup.findAll('iframe'):
            #     gh = soup.findAll('iframe')[0]['src']
            #     hh = soup.findAll('iframe')
            # Append an <a src=...> inside each YouTube iframe — presumably
            # so the Android client can intercept the URL; confirm.
            for p in soup.findAll("iframe"):
                if "youtube" in p['src']:
                    newTag = Tag(soup, "a")
                    newTag.attrs.append(("src", p.get('src')))
                    p.append(newTag)
            content = unicode(soup)
        if obj.source is not None and obj.source != '':
            # Append a "Sources:" paragraph, flattening nested <p> markup.
            content = content + "<p>Sources: " + obj.source.replace(
                "<p>", "").replace("</p>", "") + "</p>"
        else:
            content = content

        content = obj.get_modified_content(content, content_type='mobile')
        return content
Example #28
0
def select_calendar(month=None, year=None):
    """Build an HTML month calendar whose day cells are submit buttons.

    Each non-empty day cell gets an
    ``<input type="submit" name="meetingday" value="<day>">`` prepended to
    it. The generated markup is also written to "myHTML.html" (preserved
    as a side effect for backward compatibility).

    :param month: month number 1-12; defaults to the current month.
    :param year: four-digit year; defaults to the current year.
    :return: the calendar HTML as a string.
    """
    now = datetime.now()
    # The original defaults (None) crashed formatmonth(); fall back to the
    # current month/year so the declared defaults are actually usable.
    if month is None:
        month = now.month
    if year is None:
        year = now.year

    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)  # week starts on Sunday
    soup = BeautifulSoup(cal.formatmonth(year, month))

    for cell in soup.findAll('td'):
        if cell['class'] != "noday":
            for text_node in cell.findAll(text=True):
                # Detach the bare day number and replace it with a button.
                day_label = NavigableString(text_node)
                text_node.extract()
                button = Tag(soup, 'input')
                button['type'] = "submit"
                button['name'] = "meetingday"
                button['value'] = day_label
                cell.insert(0, button)

    html = soup.prettify()
    # Keep writing the file (original behavior), but return the markup
    # directly instead of re-reading the file line by line.
    with open("myHTML.html", 'w') as outfile:
        outfile.write(html)

    return html
Example #29
0
def save():
    """Persist a review or decision script into the target paper's HTML.

    Reads the JSON payload from the Flask request and embeds its "script"
    entry as an application/ld+json <script> in the paper's <head>. For a
    review, any previous review script by the same author is removed first
    and the <body> <section>s are replaced with the submitted sections; a
    decision is inserted directly into the head.

    :return: JSON response ``{"result": True}``.
    """
    json_data = request.json
    with open(ret_url(json_data["doc"], "/papers"), "r+") as inf:
        soup = BeautifulSoup(inf.read())
        # Only reviews rewrite the body; decisions go straight to <head>.
        if json_data["type"] == "review":
            # Drop a pre-existing review script by the same author, if any.
            for script in soup.findAll("script", {"type": "application/ld+json"}):
                data = json.loads(script.text.strip())
                if data[0]["@type"] == "review":
                    if data[0]["article"]["eval"]["author"] == "mailto:" + json_data["author"]:
                        script.extract()
                        break
            # Clear the body and rewrite it from the submitted sections.
            for section in soup.findAll("section"):
                section.extract()
            for section in json_data["sections"]:
                soup.body.insert(len(soup.body.contents), BeautifulSoup(section))
        # Build the new ld+json script and append it to <head>.
        new = Tag(soup, "script")
        new.attrs.append(("type", "application/ld+json"))
        new.string = json.dumps(json_data["script"])
        soup.head.insert(len(soup.head.contents), new)
        # Overwrite the file in place; the `with` block handles closing
        # (the original closed explicitly inside the block, which was
        # redundant and made the context manager close a closed file).
        html = soup.prettify("utf_8")
        inf.seek(0)
        inf.write(html)
        inf.truncate()
    return jsonify({"result": True})
Example #30
0
    def _set(self, topic, key, value, topic_attr=None):
        """Set *key* = *value* under *topic* in the settings document.

        Ensures the XML declaration and the settings root element exist
        before delegating element creation to ``self._set_element``.

        :return: success status
        :rtype: bool"""
        # Prepend the XML declaration when the document lacks a header.
        document_text = unicode(self._soup).strip()
        if not document_text.startswith("<?xml"):
            self._soup.insert(0, NavigableString(self.HEADER))

        # Create the settings root element when it is missing.
        root_tag = self._soup.find(self.root)
        if root_tag is None:
            self._soup.insert(1, Tag(self._soup, self.root))
            root_tag = self._soup.find(self.root)

        # Ensure the topic element exists under the root.
        topic_node = self._set_element(root_tag, topic.lower(), attr=topic_attr)
        if topic_node is None:
            return False

        # escape() guards against XML-special characters in the value.
        # Add "" since XML may introduce whitespaces.
        #key_tag = self._set_element(topic_node, key, '"{0}"'.format(value))
        key_node = self._set_element(topic_node, key.lower(), escape(value))
        return key_node is not None