Example #1
   def outputData(self, outfile):

      outSoup = BeautifulStoneSoup("", selfClosingTags=["path"])
      outRoot = Tag(outSoup, "svg")
      outRoot["xmlns"] = "http://www.w3.org/2000/svg"
      outRoot["width"] = self.width
      outRoot["height"] = self.height
      outRoot["version"] = 1.1

      outSoup.insert(0, outRoot)


      for char in reversed(self._soup.findAll("char")):
         path = Tag(outSoup, "path")
         path["d"] = char["d"]
         path["style"] = self.style
         outRoot.insert(0, path)


      svg_header = "<?xml version=\"1.0\" standalone=\"no\"?>\n"
      svg_header += "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\""
      svg_header += " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n"

      self.scaleAndGridAlphabet(outSoup)

      outfile = open(outfile, "w")

      outfile.write(svg_header +  outSoup.prettify())

      outfile.close()
Example #2
 def _prepare_soup_put_assign(self):
   soup, root_tag = self._prepare_soup_root_tag()
   try:
     id_tag = Tag(soup, 'ID')
     id_tag.insert(0, NavigableString(self.id))
     root_tag.insert(0, id_tag)
   except AttributeError:
     raise ValueError("You must have ID for PUT.")
   
   i = 1
   old_list = [x.id for x in self.xml_object.assigned]
   new_list = [x.id for x in self.assigned]
   added = [x for x in new_list if x not in old_list]
   removed = [x for x in old_list if x not in new_list]
   for staff_id in added:
     add_tag = Tag(soup, 'add', [('id', '%s' % staff_id),])
     add_tag.isSelfClosing=True
     root_tag.insert(i, add_tag)
     i = i+1
   for staff_id in removed:
     remove_tag = Tag(soup, 'remove', [('id', '%s' % staff_id),])
     remove_tag.isSelfClosing=True
     root_tag.insert(i, remove_tag)
     i = i+1
   return soup
Example #3
def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor produces newer
    # HTML than Vodafone Live will accept, so we have to translate 'em'
    # back to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
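For illustration, a hypothetical call to the function above (the exact result depends on the tidy wrapper behind tidy_fragment):

sanitize_html('<p class="x"><strong>Hi</strong> <em>there</em><script>alert(1)</script></p>')
# roughly: u'<p><b>Hi</b> <i>there</i>alert(1)</p>' -- attributes are stripped,
# strong/em are translated to b/i, and the disallowed <script> tag is hidden
# while its text content is kept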
def addEpisode(xbmcid, scraperid, snr,enr, title, airdate):
	f = getDatabase("r")
	soup = BeautifulSoup(f.read())
	f.close()
	serie = soup.find(scraperid = scraperid)
	#TODO check inconsistency
	if serie == None :
		return False
	season = serie.find(seasonnr = snr)
	if season == None:
		tag = Tag(soup, "season")
		tag.attrs.append(('seasonnr', snr))
		serie.append(tag)
		season = serie.find(seasonnr = snr)
	if season == None:
		util.msg(localize(50000), localize(50004))
		return False
	episode = season.find(episodenr = enr)
	if episode == None:
		episodetag = Tag(soup, "episode")
		episodetag.attrs.append(('episodenr', enr))
		titletag = Tag(soup, "title")
		titletag.insert(0,title)
		episodetag.append(titletag)
		airdatetag = Tag(soup, "airdate")
		airdatetag.insert(0,airdate)
		episodetag.append(airdatetag)
		season.append(episodetag)
		
		f = getDatabase("w")
		f.write(soup.prettify())
		f.close()
	#else:
		#check consistency
	return True
Example #5
def do_iperimage(value):

    """detects iPernity static urls and creates clickable thumbnail for it"""

    soup = BeautifulSoup(value)
    iprl = re.compile("^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$")
    iprl_thumb = "500"
    iprl_zoom = "560"

    for img in soup.findAll("img", src=iprl):

        match = iprl.match(img["src"])
        try:
            thumb = Tag(soup, "img")
            thumb["alt"] = img["title"]
            thumb["src"] = match.group(1) + iprl_thumb + match.group(3)

            link = Tag(soup, "a")
            link["href"] = match.group(1) + iprl_zoom + match.group(3)
            link["rel"] = "lightbox"
            link["title"] = img["title"]
            link.insert(0, thumb)

            img.replaceWith(link)
        except:
            pass

    return unicode(soup)
def htmlizeTree(tree, base):
    from BeautifulSoup import Tag, NavigableString
    import cgi
    elements = []
    for branch in tree:
        if branch.has_key("href"):
            el = Tag(base, "A")
            for attrib in ("href", "add_date", "icon"):
                el[attrib] = branch[attrib]
        else:
            el = Tag(base, "H3")
        try:
            el.insert(0, NavigableString(branch["name"]))
        except:
            el.insert(0, NavigableString("[can not convert]"))
            print "can not convert ", branch["name"]
        dt = Tag(base, "DT")
        dt.insert(0, el)
        elements.append(dt)
        if branch.has_key("tree"):
            elements.append(htmlizeTree(branch["tree"], base))
    dl = Tag(base, "DL")
    for i, element in enumerate(elements):
        dl.insert(i, element)
    dd = Tag(base, "DD")
    dd.insert(0, dl)
    return dd
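The bookmark tree walked above is a list of dicts: each branch has a "name", folders carry a "tree" of children, and links carry "href", "add_date" and "icon". A hypothetical input in that shape:

# hypothetical bookmark tree in the shape htmlizeTree() expects
tree = [
    {"name": "Folder", "tree": [
        {"name": "Example", "href": "http://example.com/",
         "add_date": "1200000000", "icon": ""},
    ]},
]
doc = htmlizeTree(tree, base)   # base is an existing BeautifulSoup instance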
 def geo_term_extract(self, desc):
     data = values ={
              'maxRows':'1',
              'fuzzy':'1',
              'country':'EE',
              'featureClass':'P',
              'operator':'OR',
              'username':self.geonames_user,
              'q':desc.encode('utf-8')}
     data=urllib.urlencode(values)
 
     link = u"http://api.geonames.org/search"
     xmldata = urllib.urlopen(link, data)
     soup = BeautifulSoup(xmldata)
 #   print soup.prettify()
     lng = '0'
     lat = '0'
     if len(soup.findAll("lat")) > 0:
         lng = soup.findAll("lng")[0].text
         lat = soup.findAll("lat")[0].text
         lat_f = float(lat)
         lng_f = float(lng)
         lat = '%.5f' % ((lat_f * 10000 + random.uniform(1,80))/10000)
         lng = '%.5f' % ((lng_f * 10000 + random.uniform(1,80))/10000)
     
     soup2 = BeautifulSoup()
     tag1 = Tag(soup2, "Point")
     tag2 = Tag(soup2, "coordinates")
     soup2.insert(0, tag1)
     tag1.insert(0, tag2)
     text = NavigableString(lng + "," + lat)
     tag2.insert(0, text)
 #   print soup2
     result = (soup2.__str__()).encode("utf-8")
     return [result, lat, lng]
def _tag_generator(soup, name, attrs=[], contents=None):
    if attrs != []:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents != None:
        new_tag.insert(0, contents)
    return new_tag
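A hypothetical use of this helper with BeautifulSoup 3, building a small XML fragment:

from BeautifulSoup import BeautifulStoneSoup

soup = BeautifulStoneSoup()
entry = _tag_generator(soup, "entry", attrs=[("id", "42")])
entry.insert(0, _tag_generator(soup, "title", contents="Hello"))
soup.insert(0, entry)
print soup   # roughly: <entry id="42"><title>Hello</title></entry>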
Example #9
def code_colorizer(entry):
    """
    Uses BeautifulSoup to find and parse the code in the entry
    that will be colorized and changes it according to the syntax
    specs using pygments.

    The HTML code should include the colorized code wrapped into a
    div which has language (e.g. python) as id and "code" as class
    attributes.

    Best part of using a filter is that we don't have to change the
    real post containing the code. The worst part is that we have to
    search for the code layer in each post.

    """

    if settings.COLORIZE_CODE:
        try:
            import HTMLParser  # needed for HTMLParser.HTMLParseError below
            from BeautifulSoup import BeautifulSoup, Tag
            from pygments import highlight
            from pygments.lexers import get_lexer_by_name
            from pygments.formatters import HtmlFormatter
        except ImportError:
            return entry

        try:
            parser = BeautifulSoup(entry, convertEntities=BeautifulSoup.ALL_ENTITIES)
        except HTMLParser.HTMLParseError:
            return entry

        # searching for code blocks in the blog entry
        code_blocks = parser.findAll("div", attrs={"class": "code"})

        if len(code_blocks) > 0:
            for block in code_blocks:
                # if the code block's wrapper div doesn't have an id
                # attribute don't colorize the code
                if "id" in block.attrMap:
                    language = block.attrMap["id"]
                else:
                    continue

                # finding the exact place of the code
                layer = block.div if block.div else block
                # removing any html tags inside the code block
                [tag.extract() for tag in layer.findAll()]
                # getting the original code in the block
                code = "".join(layer.contents)
                # colorizing the code
                lexer = get_lexer_by_name(language)
                formatter = HtmlFormatter(linenos="table", style="tango", cssclass="code")
                colorized_code = Tag(parser, "div") if block.div else Tag(parser, "div", attrs=(("id", language), ("class", "code")))
                colorized_code.insert(0, highlight(code, lexer, formatter))
                layer.replaceWith(colorized_code)

            return parser.renderContents()

    return entry
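For reference, a hypothetical post fragment in the shape the docstring above describes (a wrapper div with the language as id and "code" as class) that this filter would pick up:

entry = '<div id="python" class="code"><div>print "hello"</div></div>'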
Example #10
def main(bot, args):
	'''Reply to a listener. Parameters: <user_id> <message>
If an exclamation mark is given as user_id, the message will look like an announcement.'''
	syl = { '0' : 'be', '1' : 'sa', '2' : 'ko', '3' : 'pa', '4' : 're', '5' : 'du', '6' : 'ma', '7' : 'ne', '8' : 'wa', '9' : 'si', 'a' : 'to', 'b' : 'za', 'c' : 'mi', 'd' : 'ka', 'e' : 'ga', 'f' : 'no' }
	salt = bot.settings["ans_salt"]
	message_limit = 250
	userpost = ""
	if len(args) < 2:
		return
	blacklisting = False
	if args[0] != "!":
		if args[0] == "?":
			blacklisting = True
			del args[0]
		if len(args[0]) != 12:
			return _("incorrect name entered, should be 12 symbols.")
		check = md5()
		check.update(args[0][:8].encode('utf-8') + salt)
		if check.hexdigest()[:4] != args[0][8:12]:
			return _("incorrect name entered (checksum invalid).")
	
		if blacklisting:
			bot.blacklist.append(args[0])
			return _("%s was added to blacklist.") % args[0]

		to = ">>" + args[0]
		if args[0] in bot.usersposts:
			userpost = "<span class=\"userpost\">&gt; " + escape(bot.usersposts[args[0]]) + "</span><br/>"
	else:
		to = "!"
        message = " ".join(args[1:])
	if len(message) > message_limit:
		return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))
	soup = BeautifulSoup(open(bot.settings["ans_file"], "r"))
	posts = soup.findAll('p')
	new_post = Tag(soup, 'p')
	user_id = Tag(soup, 'span', [('id', 'user_id')])
	if to != "!":
		user_id.insert(0, escape(to))
	else:
		user_id.insert(0, "<b>&gt;&gt;ОБЪЯВЛЕНИЕ&lt;&lt;</b>")
	new_post.insert(0, '[' + datetime.datetime.strftime(datetime.datetime.now(), "%H:%M:%S") + ']')
	new_post.insert(1, user_id)
	new_post.insert(2, userpost + escape(message))
	if len(posts) > 0:
		posts[0].parent.insert(2, new_post)
	else:
		soup.find('h1').parent.insert(1, new_post)
	if len(posts) > 9:

		posts[len(posts) - 1].extract()

	f = open(bot.settings["ans_file"], "w")
	f.write(soup.prettify())
	f.close()
        
        return _("sent.")
Example #11
def neighborhood_kml(request,neighborhood):
    neighborhood = Neighborhood.objects.get(name=neighborhood)
    soup = BeautifulSoup(neighborhood.geom.kml)
    tag = Tag(soup, "extrude")
    soup.polygon.insert(0, tag )
    text = "1"
    tag.insert(0, text)
    xml = str(soup )
    return render_to_response("restaurants/kml_template.html",{'neighborhood': neighborhood,"xml": xml}, context_instance=RequestContext(request))
Example #12
def replaceJavascript(base_url, soup):
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            real_js = real_js.replace('</', 'u003c/')
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception,e:
            print 'failed to load javascript from %s' % js['src']
            print e
Example #13
def build_flat_xml_object(name, fields):
    from BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag
    from django.utils.html import escape
    soup = BeautifulStoneSoup()
    obj = Tag(soup, name)
    soup.insert(0, obj)
    for name, value in fields:
        tag = Tag(soup, name)
        tag.insert(0, NavigableString(escape(value)))
        obj.insert(0, tag)
    return unicode(soup)
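A hypothetical call (escape comes from Django, so this assumes Django is importable); note that every field is inserted at index 0, so the fields come out in reverse order:

build_flat_xml_object("person", [("name", "Ada"), ("role", "R&D")])
# roughly: u'<person><role>R&amp;D</role><name>Ada</name></person>'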
Example #14
  def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Putting the "name" attribute into the "id" attribute of the <tr> tag.
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>
    '''

    table_headers = []
    for tag in self.soup.findAll('tr'):
      if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']:
        #tag['id'] = tag.td.h2.a['name']
        tag.string = tag.td.h2.a.next
        tag.name = 'h2'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()
    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
      print "Header tag: %s is %s" % (tag.name, tag.string.strip())
      # Is this a heading in the middle of a table?
      if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
        print "Splitting Table named %s" % tag.string.strip()
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        new_table = Tag(self.soup, name='table', attrs=table.attrs)
        table_parent.insert(table_index + 1, new_table)
        tag_index = table.contents.index(tag)
        for index, row in enumerate(table.contents[tag_index:]):
          new_table.insert(index, row)
      # Now move the <h2> tag to be in front of the <table> tag
      assert tag.parent.name == 'table'
      table = tag.parent
      table_parent = table.parent
      table_index = table_parent.contents.index(table)
      table_parent.insert(table_index, tag)
    def parse_content(self, content, attachments, tags):

        soup = BeautifulSoup(content)
        pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

        # images
        for match in soup.findAll('img'):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    importedname = self.import_file(filename)
                    match.replaceWith(Tag(soup, 'img', [('src', importedname)]))


        # pdfs
        for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    # convert pdf -> image
                    images = pdf2image(filename)

                    # import each jpg
                    imageTags = Tag(soup, "span")
                    for image in images:
                        importedname = self.import_file(image)
                        # add new image tag
                        imageTags.insert(images.index(image), Tag(soup, 'img', [('src', importedname)]))

                    # replace embed with <img src...> for each image
                    match.replaceWith(imageTags)

        # TODO: audio
        # TODO: video


        #plugins          

        # TODO: qa-format as in Supermemo
        #for match in soup.find(string=re.compile("A:")):
        #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']
        


        return str(soup).decode('utf-8')
Example #16
def do_inlines(value, render_content=False):
   """ 
   Processes inlines for a string of text (such as a blog post). If 
   render_content is True, will return the full blog post HTML, with inlines
   rendered through their templates. If rendered_content is false, will return
   a list of inline objects.
   """
   from BeautifulSoup import BeautifulStoneSoup, Tag
   
   # Parse the entry content, passing BeautifulStoneSoup our inline tag (plus the regular HTML ones) as self-closing.
   content = BeautifulStoneSoup(value, selfClosingTags=['br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base','inline'])
   
   # Set up the inline_objects list.
   inline_objects = []

   # If render_content is true, then we want the entire rendered HTML as a result.
   if render_content == True:
     
    # Look for 'inline' elements, and iterate through them.
    for inline in content.findAll('inline'):
      # Get the html from the template for each inline element

      html_dict = process_inline(inline, html=True)
      try:
        inline_object_details = html_dict["inline_object_details"]
      except:
        return html_dict
      # Add the details of this inline to the inline_objects array.
      inline_object_details = html_dict["inline_object_details"]
      inline_html = html_dict["inline_html"]
      inline_objects.append(html_dict["inline_object_details"])
      # Add a new div tag to the tree, and then replace our inline tag with it, instead.
      inline_tag = Tag(content, "div", [("id", inline_object_details['html_id']),("class", inline_object_details['html_class'])])
      inline_tag.insert(0, inline_html)
      inline.replaceWith(inline_tag)

      
    # Render out the final HTML for the blog post.
    final_html = content.renderContents()
    return final_html
       
   # If render_content is false, then we just want a list of the objects themselves.
   else:
      
      # Look for 'inline' elements, and iterate through them.
      for inline in content.findAll('inline'):
        # Add the details of this inline to the inline_objects list.
        processed_inline = process_inline(inline, html=False)
        inline_objects.append(processed_inline['object'])
      
      # Return the final list of inline objects.
      return inline_objects
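For context, the content being parsed contains self-closing <inline> elements, roughly like the hypothetical fragment below (the attribute names are purely illustrative; the real ones depend on what process_inline expects):

value = 'Intro paragraph <inline type="image" id="3" /> closing paragraph.'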
Example #17
def markdown_pygment(txt, linenos="table", stripimg=False):
    """
    Convert Markdown text to Pygmentized HTML

    """
    html = markdown(txt, safe_mode='replace')
    soup = BeautifulSoup(html)
    formatter = HtmlFormatter(cssclass='source', linenos=linenos)
    dirty = False

    for img in soup.findAll('img'):
        dirty = True
        if stripimg:
            img.replaceWith("[IMAGES NOT ALLOWED IN COMMENTS]")
        else:
            # learn BeautifulSoup and clean this up
            img['class'] = 'postimg'
            toWrap = img
            p = img.parent
            if p.name == 'a':
                # This is a link, wrap the div around the parent of the link
                toWrap = p
                p = p.parent

            imgDiv = Tag(soup, "div", [("class", "image-wrapper")])
            imgDiv.insert(0, toWrap)
            p.insert(0, imgDiv)

            # Remove surrounding <p></p> to make HTML valid
            # This is all rather horrible but works with my blog
            # posts - need to fix so that the code is correct in all cases
            # easiest way may be to modify markdown itself
            para = imgDiv.parent
            if para.name == 'p':
                para.replaceWith(imgDiv)

    for tag in soup.findAll('pre'):
        if tag.code:
            txt = tag.code.renderContents()
            if txt.startswith('pygments:'):
                lexer_name, txt = txt.split('\n', 1)
                lexer_name = lexer_name.split(':')[1]
                txt = _replace_html_entities(txt)
                if lexer_name in _lexer_names:
                    lexer = get_lexer_by_name(lexer_name, stripnl=True, encoding='UTF-8')
                    tag.replaceWith(highlight(txt, lexer, formatter))
                    dirty = True
    if dirty:
        html = unicode(soup)

    return html
Example #18
def userlist(request):
        x = BeautifulSoup()
        root = Tag(x,'root')
        x.insert(0,root)
        for u in models.Group.objects.get(name='Курсанты').user_set.all():
                root.insert(0,'\n')
                root.insert(0,Tag(x,'user',[
                        ('uid',str(u.id)),
                        ('username',u.username),
                        ('first_name',u.first_name),
                        ('last_name',u.last_name),
                        ]))
        
        return HttpResponse(x)
Example #19
def anchorArticles(txt):
    # find all text nodes starting with 'Article', wrap each in a named <a> and prepend a hoverable link to the anchor
    aregex=re.compile('^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)
    node=nsoup.find(text=aregex)
    while node:
        nodeidx=node.parent.contents.index(node)
        match=str(re.match(aregex,node).group())
        # create named <a>
        name=match.replace(' ','_')
        a=Tag(nsoup,'a',[('name',name)])
        a.insert(0,match)
        # create a link that is displayed if the <a> is hovered
        link=Tag(nsoup,'a', [('class',"anchorLink"), ('href','#'+name)])
        link.insert(0,"#")
        # create a container for the a and the link
        hover=Tag(nsoup,'span',[('class','hover')])
        hover.insert(0,a)
        hover.insert(0,link)
        node.parent.insert(nodeidx,hover)
        # cut the newly wrapped from the original node.
        newNode=NavigableString(node[len(match):])
        node.replaceWith(newNode)
        node=newNode.findNext(text=aregex)
    return str(nsoup)
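A hypothetical call, assuming the module-level BeautifulSoup 3 imports (BeautifulSoup, Tag, NavigableString, re) this snippet relies on:

print anchorArticles("<p>Article 12 Data protection</p>")
# roughly: <p><span class="hover"><a class="anchorLink" href="#Article_12">#</a>
#          <a name="Article_12">Article 12</a></span> Data protection</p>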
Example #20
    def cache_text_to_html(self):
        from markdown import markdown
        from BeautifulSoup import BeautifulSoup, Tag, NavigableString
        from mnemosyne.utils import purge_html

        text_html = markdown(self.text)
        soup = purge_html(BeautifulSoup(text_html))

        i_remember_tag = Tag(soup, "span")
        i_remember_tag.insert(0, NavigableString(self.i_remember() + " "))
        i_remember_tag["class"] = "iremember"
        soup.first().insert(0, i_remember_tag)

        return soup.decode()
Example #21
    def collage2table(self, data, u=False):
        """
        Takes an html page generated from collage in the oshmail format and
        converts some divs to a table layout. The collage builds a system
        of nested divs for rows and columns. What we need is a table with
        one row and two columns: 1.1 and 1.2.
        """
        soup = BeautifulSoup(data)

        # find the real content cells
        cell_11, cell_12 = soup.findAll(attrs={"class": "collage-column"}, limit=2)

        # create a table
        table = Tag(soup, "table", [("id", "collage-table")])

        row1 = Tag(soup, "tr")
        row2 = Tag(soup, "tr")

        col1 = Tag(soup, "td", [("valign", "top"), ("id", "collage-table-cell1"), ("width", "590")])
        col2 = Tag(soup, "td", [("valign", "top"), ("id", "collage-table-cell2"), ("width", "200")])

        col1.insert(0, cell_11)
        col2.insert(0, cell_12)

        row1.insert(0, col1)
        row1.insert(1, col2)

        table.insert(0, row1)

        if u:
            return unicode(table)
        return str(table)
 def highlightedNode(self, target_node, yes_phrase, parent_soup):
     content = str(target_node)
     text = content.lower()
     j = text.find(yes_phrase)
     tag = Tag(parent_soup, "div", [("style", "background-color:#FF8A0D")])
     if yes_phrase:
        tag.append(content[:j])
        bold = Tag(parent_soup, "b")
        bold.insert(0,content[j:(j + len(yes_phrase))])
        tag.append(bold)
        tag.append(content[(j + len(yes_phrase)):])
     else:
        tag.append(content)
     return tag
Example #23
def ConvertToTestHtml(quest):
	types = quest.type
	titles = quest.text
	quests_ids = [quest.id]
	answers = RETestAnswer.objects.filter(question__id__in=quests_ids)
	newbs = BeautifulSoup()
	pNode = Tag(newbs, 'p')
	newbs.insert(0,pNode)
	if quest.img:
		print 'Image!!!'
		print quest.img.url
		imageNode = Tag(newbs, 'image', [('src', quest.img.url)])
		newbs.insert(0,imageNode)
	TitleNode = Tag(newbs, 'p')
	TitleNode.string = titles
	newbs.insert(0,TitleNode)
	i = 0
	if types != 1:
		for answer in answers:
			radioname = 'ans' + str(i)
			nt = Tag(newbs,'input', [('type', 'radio'), ('type', radioname), ('name', 'answerradio'), ('value', str(answer.is_correct))])
			nt.string = answer.name
			pNode.insert(len(pNode.contents), nt)
			pNode.insert(len(pNode.contents), Tag(newbs, 'br'))
	else:
		for answer in answers:
			radioname = 'ans' + str(i)
			nt = Tag(newbs,'input', [('type', 'text'), ('name', 'answertext'),('ans', answer.name)])
			pNode.insert(len(pNode.contents), nt)
			pNode.insert(len(pNode.contents), Tag(newbs, 'br'))
	return newbs.prettify()
Example #24
  def delete(self):
    soup = BeautifulSoup()
    client_tag = Tag(soup, 'Client')
    soup.insert(0, client_tag)
    try:
      id_tag = Tag(soup, 'ID')
      id_tag.insert(0, NavigableString('%d' % self.id))
      client_tag.insert(0, id_tag)
    except AttributeError:
      raise ValueError("You must have id for delete operation.")  

    response = rest_client.Client("").POST(self.delete_url, str(soup))
    soup = BeautifulStoneSoup(response.content)
    if soup.status and soup.status.contents[0].lower() == 'error':
      raise ResponseStatusError(soup.errordescription.contents[0])
Example #25
 def sanitize_story(self, story_content):
     soup = BeautifulSoup(story_content.strip())
     fqdn = Site.objects.get_current().domain
     
     for iframe in soup("iframe"):
         url = dict(iframe.attrs).get('src', "")
         youtube_id = self.extract_youtube_id(url)
         if youtube_id:
             a = Tag(soup, 'a', [('href', url)])
             img = Tag(soup, 'img', [('style', "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\"" % (fqdn, youtube_id)), ('src', 'http://img.youtube.com/vi/%s/0.jpg' % youtube_id)])
             a.insert(0, img)
             iframe.replaceWith(a)
         else:
             iframe.extract()
     
     return unicode(soup)
Example #26
def AllCategories(request):
	print 'allcat'
	x = BeautifulSoup()
	#root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
	#x.insert(0,root)
	AllCategories = RECategory.objects.filter(parent__isnull=True).order_by('-number')
	
	AllAnswered = {}
	# only add the most recent RELog (by date) for each category and log type
	for log in RELog.objects.filter(user=request.user).order_by('-date'):
		if not log.category_id in AllAnswered:
			AllAnswered[log.category_id] = {}
		if not log.type_log in AllAnswered[log.category_id]:
			AllAnswered[log.category_id][log.type_log] = log
	for category in AllCategories:
		print category.id
		nt = Tag(x,'li', [("id", str(category.id))])
		log = AllAnswered.get(category.id)
		rating = ''
		if log:
			log = log.get(5)
			if log :
				rating = 'Оценка: ' + str(log.rating)
		div = Tag(x,'div')
		div.string = rating
		div["class"] = "rating"
		#div["style"] = "width: 150px; float: right;"
		nt.insert(0, div)
		
		if category.is_3d:
			isDDD = "Есть";
		else:
			isDDD = "Нет";
		div = Tag(x,'div')
		div.string = isDDD 
		div["class"] = "is3d"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div["class"] = "demo"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		div.string = str(category.type_category)
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div.string = category.name
		nt.insert(0, div)
		
		x.insert(0,nt)
		recurseCategories(category, nt, x, AllAnswered)
	res = x.prettify()
	#print res
	print 'endallcat'
	return res
Example #27
def add_noindex_to_a(text):
    doc = BeautifulSoup(text)
    host_orig = urlparse(settings.SITE_URL)[1]
        
    for a in doc.findAll('a'):
        try:
            host = urlparse(a['href'])[1]
        except:
            continue
        

        if a.findParent('noindex')==None:
            if host!=host_orig:
                noindex = Tag(doc,"noindex")
                a.replaceWith(noindex)
                a['rel']='nofollow'
                noindex.insert(0,a)
    return unicode(doc)
def replaceTag(target, targettag,tag=None):
    if len(targettag) >1:
        elems = target.findAll(targettag[0],targettag[1])
    else:
        elems = target.findAll(targettag[0])
    for element in elems:
        soup = BeautifulStoneSoup()
        contents = element.renderContents()
        contents = charReplacements(contents)
        if tag is None:
            element.replaceWith(contents)
        elif tag == 'newline':
            element.replaceWith('\n%s'%contents)
        else:
            t = Tag(soup, tag)
            t.insert(0,contents)
            element.replaceWith( t )
    return
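Hypothetical calls, assuming a BeautifulSoup 3 soup and the charReplacements helper this snippet depends on:

replaceTag(soup, ('font', {'size': '2'}))      # unwrap matching <font> tags, keeping their contents
replaceTag(soup, ('br',), tag='newline')       # turn each <br> into a newline
replaceTag(soup, ('b',), tag='strong')         # re-wrap <b> contents in a new <strong> tag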
Example #29
 def __replaceCss(self, baseUrl, soup):
     if self.__css != DO_NTH:
         for css in soup.findAll('link',{'rel':'stylesheet','href':re.compile('.+')}):
             try:
                 cssHref = css['href']
                 cssUrl = baseUrl.resolve(cssHref)
                 if self.__css == INLINE:
                     data = self.__contentResolver.getContent(cssUrl, False)[0]
                     cssContent = self.__inlineExternalResourcesInCss(cssUrl, data)
                 else:
                     cssContent = u"<!--" + str(cssUrl) + u"-->"
                 newStyleTag = Tag(soup, "style")
                 newStyleTag.insert(0,  MyNavigableString(cssContent))
                 if css.get('media'):
                     newStyleTag['media'] = css['media']
                 css.replaceWith(newStyleTag)
             except BaseException as e:
                 self.__logger.exception(u'failed to load css from %s' % css['href'])
Example #30
 def __replaceJavascript(self, baseUrl,soup):
     if self.__js != DO_NTH:
         for js in soup.findAll('script'):
             src = js.get("src") if js.get('src') else None
             try:
                 if src and self.__js == INLINE:
                     jsContent = self.__contentResolver.getContent(baseUrl.resolve(src), False)
                 elif self.__js == REMOVE:
                     u = str(baseUrl.resolve(src)) if src else "inlined"
                     jsContent = "<!--" + u + "-->"
                 else:
                     #nothing to change
                     continue
                 newScriptTag = Tag(soup, "script")
                 newScriptTag.insert(0,  MyNavigableString(jsContent))
                 js.replaceWith(newScriptTag)
             except BaseException as e:
                 self.__logger.error(u'failed to load javascript from %s' % unicode(src))
Example #31
    def sanitize_story(self, story_content):
        soup = BeautifulSoup(story_content.strip())
        fqdn = Site.objects.get_current().domain

        for iframe in soup("iframe"):
            url = dict(iframe.attrs).get('src', "")
            youtube_id = self.extract_youtube_id(url)
            if youtube_id:
                a = Tag(soup, 'a', [('href', url)])
                img = Tag(soup, 'img', [(
                    'style',
                    "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\""
                    % (fqdn, youtube_id)),
                                        ('src',
                                         'http://img.youtube.com/vi/%s/0.jpg' %
                                         youtube_id)])
                a.insert(0, img)
                iframe.replaceWith(a)
            else:
                iframe.extract()

        return unicode(soup)
    def geo_term_extract(self, desc):
        data = values = {
            'maxRows': '1',
            'fuzzy': '1',
            'country': 'EE',
            'featureClass': 'P',
            'operator': 'OR',
            'username': self.geonames_user,
            'q': desc.encode('utf-8')
        }
        data = urllib.urlencode(values)

        link = u"http://api.geonames.org/search"
        xmldata = urllib.urlopen(link, data)
        soup = BeautifulSoup(xmldata)
        #   print soup.prettify()
        lng = '0'
        lat = '0'
        if len(soup.findAll("lat")) > 0:
            lng = soup.findAll("lng")[0].text
            lat = soup.findAll("lat")[0].text
            lat_f = float(lat)
            lng_f = float(lng)
            lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
            lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)

        soup2 = BeautifulSoup()
        tag1 = Tag(soup2, "Point")
        tag2 = Tag(soup2, "coordinates")
        soup2.insert(0, tag1)
        tag1.insert(0, tag2)
        text = NavigableString(lng + "," + lat)
        tag2.insert(0, text)
        #   print soup2
        result = (soup2.__str__()).encode("utf-8")
        return [result, lat, lng]
Example #33
def recurseCategories(parentCat, root, x, AllAnswered):
	childcats = parentCat.children()
	if childcats:
		nt = Tag(x,'ul', [('style', 'display:none')])
		root.insert(len(root.contents),nt)
		root = nt
	for category in childcats:
		root.insert(len(root.contents),'\n')
		nt = Tag(x,"li", [("id", str(category.id))])		
		log = AllAnswered.get(category.id)
		rating = ''
		if log:
			log = log.get(5)
			if log :
				rating = 'Оценка: ' + str(log.rating)
		div = Tag(x,'div')
		div.string = rating
		div["class"] = "rating"
		#div["style"] = "width: 150px; float: right;"
		nt.insert(0, div)
		
		if category.is_3d:
			isDDD = "Есть";
		else:
			isDDD = "Нет";
		div = Tag(x,'div')
		div.string = isDDD 
		div["class"] = "is3d"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div["class"] = "demo"
		#div["style"] = "margin-right: 0px;width: 110px; float: right;"
		div.string = str(category.type_category)
		nt.insert(0, div)
		
		div = Tag(x,'div')
		div.string = category.name
		nt.insert(0, div)
		
		root.insert(len(root.contents), nt)
		
		recurseCategories(category, nt, x, AllAnswered)
Example #34
def main(bot, args):
	'''Reply to a listener. Parameters: <user_id> <message>
If an exclamation mark is given as user_id, the message will look like an announcement.
If the @ symbol (or " in the Russian keyboard layout) is given as user_id, the id of the last post is used. Use with CAUTION!
? user_id: blacklist user_id; their messages will no longer reach the DJ.
??: show the blacklist.
?!: clear the blacklist.'''
	syl = { '0' : 'be', '1' : 'sa', '2' : 'ko', '3' : 'pa', '4' : 're', '5' : 'du', '6' : 'ma', '7' : 'ne', '8' : 'wa', '9' : 'si', 'a' : 'to', 'b' : 'za', 'c' : 'mi', 'd' : 'ka', 'e' : 'ga', 'f' : 'no' }
	salt = bot.settings["ans_salt"]
	message_limit = 250
	userpost = ""
	if len(args) == 1 and args[0] != "??" and args[0] != "?!" or not len(args):
		return
	blacklisting = False
	if args[0] != "!":
		if args[0] == "??":
			return _("blacklist:\n%s") % "\n".join(bot.blacklist)
		if args[0] == "?!":
			bot.blacklist = []
			return _("blacklist cleared.")
		if args[0] == "?":
			blacklisting = True
			del args[0]
		if args[0] == "@" or args[0] == '"':
			sender = bot.last_user_id
		elif args[0].isdigit() and int(args[0]) >= 10 and int(args[0]) < 100:
			sender = bot.num2uid[int(args[0])]
		else:
			sender = args[0]
		if len(sender) != 12:
			return _("incorrect name entered, should be 12 symbols.")
		check = md5()
		check.update(sender[:8].encode('utf-8') + salt)
		if check.hexdigest()[:4] != sender[8:12]:
			return _("incorrect name entered (checksum invalid).")
	
		if blacklisting:
			bot.blacklist.append(sender)
			return _("%s was added to blacklist.") % sender

		to = ">>" + sender
		if sender in bot.usersposts:
			userpost = "<span class=\"userpost\">&gt; " + escape(bot.usersposts[sender]) + "</span><br/>"
	else:
		to = "!"
        message = " ".join(args[1:])
	if len(message) > message_limit:
		return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))
	soup = BeautifulSoup(open(bot.settings["ans_file"], "r"))
	posts = soup.findAll('p')
	new_post = Tag(soup, 'p')
	user_id = Tag(soup, 'span', [('id', 'user_id')])
	if to != "!":
		user_id.insert(0, escape(to))
	else:
		user_id.insert(0, "<b>&gt;&gt;ОБЪЯВЛЕНИЕ&lt;&lt;</b>")
	new_post.insert(0, '<span class="timestamp">[' + datetime.datetime.strftime(datetime.datetime.now(), "%H:%M:%S") + ']</span>')
	new_post.insert(1, user_id)
	message = re.sub(r'\[([^]]*)\]', lambda x: '<a href="' + x.group(1).replace("&amp;", "&") + '" target="_blank">' + x.group(1) + '</a>', escape(message))
	message = re.sub(r'\{([^}]*)\}', lambda x: '<a href="' + x.group(1).replace("&amp;", "&") + '" target="_blank"><img style="max-width: 200px; max-height: 200px;display: inline;" src="' + x.group(1).replace("&amp;", "&") + '"/></a>', message)
	new_post.insert(2, userpost + message)
	if len(posts) > 0:
		posts[0].parent.insert(2, new_post)
	else:
		soup.find('h1').parent.insert(1, new_post)
	if len(posts) > 9:

		posts[len(posts) - 1].extract()

	f = open(bot.settings["ans_file"], "w")
	f.write(soup.prettify())
	f.close()
        
        return _("sent.")
Example #35
def main():
    """Create an XML database containing a word from the GNT, its PROIEL ID # and other PROIEL info."""

    aligned = codecs.open("aligned-gospels.wds", "rU", "utf-8")

    xml = codecs.open("proiel-GNT.xml", "rU", "utf-8")

    print "Parsing the PROIEL XML with BeautifulStoneSoup..."
    print

    proiel = BeautifulStoneSoup(xml)

    tokens = proiel.findAll('token')

    tok_dict = {}

    # creating a dictionary keyed by PROIEL IDs to speed up searching
    for token in tokens:
        tok_dict[token['id']] = token

    output = open("gospels-database.xml", "w")

    print >> output, "<div>"

    print >> output, "<title>Gospels</title>"

    count = 100001

    soup = BeautifulStoneSoup()

    word = Tag(soup, "word")

    print "Iterating through the alignment file..."
    print

    for line in aligned:
        stuff = line.split("\t")
        word = Tag(soup, "word")
        form = NavigableString(stuff[0])
        word.insert(0, form)
        # make it so that the IDs count up from 000000, not 100000
        word['id'] = str(count).replace("1", "0", 1)
        word['proiel-id'] = stuff[1]

        # adding attributes from the PROIEL XML
        if stuff[1] != "000000" and stuff[1] != "999999" and stuff[1] != "111111":
            token = tok_dict[stuff[1]]
            morph = token['morph-features'].split(",")
            word['lemma'] = morph[0]
            word['proiel-pos'] = morph[1]
            word['lang'] = morph[2]
            word['morph'] = morph[3]
            word['deprel'] = token['relation']
            try:
                word['proiel-head-id'] = token['head-id']
            except KeyError:
                word['proiel-head-id'] = "root"
        word['proiel-form'] = stuff[2].rstrip()
        count += 1
        print >> output, word

    print >> output, "</div>"

    print "Done!"
    print
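Each line of the alignment file is split on tabs into the surface form, the PROIEL token id and the PROIEL form; a hypothetical line in that shape:

# hypothetical line from aligned-gospels.wds (fields separated by tabs):
# u"Βίβλος\t140001\tΒίβλος\n"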
Example #36
    def parse_day(self, soup):
        for s in soup.findAll(
                lambda tag: tag.name == 'strong' and tag.contents == []):
            s.extract()

        self.url = ''

        if self.date >= '2011-12-12':
            body_div = soup.find('div', 'grid_10') or soup.find(
                'div', 'grid_7')
            if not body_div:
                raise ContextException, 'Could not find div containing main content.'

            body = body_div.findAll('p')

            nia_heading_re = re.compile(r'Session: 2011/2012')
            if not nia_heading_re.match(''.join(body[0](text=True))):
                raise ContextException, 'Missing NIA heading!'
            date_head = body[1].find(text=True)
            body = body[3:]  # body[2] is a PDF download link or ISBN
        else:
            body = soup('p')
            nia_heading_re = re.compile(
                r'''
				(the)?(\s|&nbsp;|<br>)*
				(transitional)?(\s|&nbsp;|<br>)*
				(
					northern(\s|&nbsp;|<br>)*
					ireland(\s|&nbsp;|<br>)*
				)?
				assembly
				''', re.IGNORECASE | re.VERBOSE)
            if not nia_heading_re.match(''.join(body[0](text=True))):
                raise ContextException, 'Missing NIA heading!'

            date_head = body[1].find(text=True)
            body = body[2:]

        timestamp = ''
        self.speaker = (None, timestamp)
        self.text = ''
        for p in body:
            ptext = re.sub("\s+", " ", ''.join(p(text=True)))
            phtml = re.sub("\s+", " ", p.renderContents()).decode('utf-8')
            #print p, "\n---------------------\n"
            if p.a and re.match('[^h/]', p.a.get('href', '')):
                continue
            if re.match('(&nbsp;)+$', ptext) or ptext == '':
                continue
            try:
                cl = p['class']
            except KeyError:
                raise ContextException, 'Missing class on paragraph: %s' % p
            cl = re.sub(' style\d', '', cl)

            if cl == 'OralAnswers':
                # Main heading, or departmental heading (in bold)
                if ptext == 'Oral Answers to Questions' or (p.find(
                        'strong', recursive=False) and len(p.contents) == 1):
                    cl = 'H3SectionHeading'
                elif re.match('\d+\.( |&nbsp;)+<strong>', phtml):
                    cl = 'B1SpeakersName'
                elif p.strong:
                    raise ContextException, 'Unhandled <strong> found in %s' % p
                else:
                    cl = 'H4StageHeading'
            if cl == 'OralWrittenQuestion' or cl == 'OralAnswers-Question':
                cl = 'B1SpeakersName'
            if cl in ('H1DocumentHeading', 'OralWrittenAnswersHeading',
                      'OralAnswers-H1Heading', 'WrittenStatement-Heading',
                      'H3SubHeading', 'OralAnswers-H2DepartmentHeading'):
                cl = 'H3SectionHeading'
            if cl in ('H4StageHeadingCxSpFirst', 'H4StageHeadingCxSpLast',
                      'OralAnswers-H3SubjectHeading'):
                cl = 'H4StageHeading'
            if cl == 'WrittenStatement-Content' or cl == 'B1BodyText-NumberedList' or cl == 'B2BodyTextBullet1':
                cl = 'B3BodyText'
            if cl == 'B3BodyText' and (phtml[0:8] == '<strong>' or re.match(
                    '\d+\.( |&nbsp;)+<strong>', phtml)):
                cl = 'B1SpeakersName'
            if cl == 'TimePeriod' and re.search('in the chair(?i)', phtml):
                cl = 'B3SpeakerinChair'
            if cl == 'B1BodyTextQuote':
                cl = 'B3BodyTextItalic'
            if p.em and len(p.contents) == 1:
                cl = 'B3BodyTextItalic'

            if cl == 'H3SectionHeading':
                self.new_major_heading(ptext, timestamp)
            elif cl == 'H4StageHeading' or cl == 'H5StageHeading' or cl == 'B3BodyTextClause':
                self.new_minor_heading(ptext, timestamp)
            elif cl == 'B1SpeakersName':
                self.display_speech()
                m = re.match('.*?:', phtml)
                if not p.strong and m:
                    newp = Tag(soup, 'p', [('class', 'B1SpeakersName')])
                    newspeaker = Tag(soup, 'strong')
                    newspeaker.insert(0, m.group())
                    newp.insert(0, phtml.replace(m.group(), ''))
                    newp.insert(0, newspeaker)
                    p = newp
                m = re.match('([0-9]+\. )(.*?) asked', phtml)
                if not p.strong and m:
                    newp = Tag(soup, 'p', [('class', 'B1SpeakersName')])
                    newspeaker = Tag(soup, 'strong')
                    newspeaker.insert(0, m.group(2))
                    newp.insert(0, phtml.replace(m.group(), ' asked'))
                    newp.insert(0, newspeaker)
                    newp.insert(0, m.group(1))
                    p = newp
                if re.search("<strong>O(&rsquo;|')Neill\)?</strong>", phtml):
                    newp = Tag(soup, 'p', [('class', 'B1SpeakersName')])
                    newspeaker = Tag(soup, 'strong')
                    newspeaker.insert(0, re.sub('</?strong>', '', m.group()))
                    newp.insert(0, phtml.replace(m.group(), ''))
                    newp.insert(0, newspeaker)
                    p = newp
                if not p.strong:
                    raise ContextException, 'No strong in p! %s' % p
                self.new_person_speak(p, timestamp)
            elif cl in ('B3BodyTextItalic', 'Q3Motion',
                        'BillAmend-AmendedText', 'BillAmend-Moved',
                        'BillAmend-withMinister', 'BillAmend-AmendMade',
                        'BillAmend-ClauseHeading', 'AyesNoes',
                        'AyesNoesParties', 'AyesNoesVotes', 'D3PartyMembers',
                        'B3SpeakerinChair', 'B3BodyTextSpeakerintheChair',
                        'H2DocumentStartTime', 'AyesNoesDivisionTellers',
                        'CommunityVoteTable'):
                match = re.match(
                    'The Assembly met at ((\d\d?)\.(\d\d) (am|pm)|noon)',
                    phtml)
                if match:
                    if match.group(1) == 'noon':
                        timestamp = '12:00'
                    else:
                        hour = int(match.group(2))
                        if hour < 12 and match.group(4) == 'pm':
                            hour += 12
                        timestamp = "%s:%s" % (hour, match.group(3))
                    self.speaker = (self.speaker[0], timestamp)
                self.new_italic_speech(ptext, phtml)
            elif cl in ('Q3MotionBullet', 'BillAmend-AmendedTextIndent',
                        'BillAmend-AmendedTextIndent2',
                        'BillAmend-AmendedTextIndent3',
                        'BillAmend-QuotewithMinister'):
                self.text += '<p class="indentitalic">%s</p>\n' % phtml
            elif cl in ('B3BodyText', 'B3BodyTextnoindent',
                        'RollofMembersList', 'TableText'):
                self.text += '<p>%s</p>\n' % phtml
            elif cl == 'Q1QuoteIndented' or cl == 'Q1Quote':
                self.text += '<p class="indent">%s</p>\n' % phtml
            elif cl == 'TimePeriod':
                timestamp = self.time_period(ptext)
            elif cl == 'MsoNormal':
                continue
            else:
                raise ContextException, 'Uncaught paragraph! %s %s' % (cl, p)
        self.display_speech()
Example #37
def update_testCase_result(src, soup):
    #print src
    localtime = time.localtime()
    updateTime = "%s_%s_%s_%s_%s" % (localtime[0], localtime[1], localtime[2],
                                     localtime[3], localtime[4])
    head = soup.h1
    #update head
    head.contents[0].replaceWith("BU test report %s" % updateTime)
    table_map = {
        "BU sanity test result. URL:": [
            "U6_BU_CI",
        ],
    }

    if not re.search("fp_version", src):
        tc_name = re.search("name=(.*?) ", src).group(1).strip("HZ-").strip()
        verdict = re.search("verdict=(.*?) ", src).group(1).strip()
        assc = re.search("assc=(.*?) ", src).group(1).strip()
        tw = re.search("tw=(.*?) ", src).group(1).strip()
        mgw = re.search("mgw=(.*?) ", src).group(1).strip()
        script = re.search("script=(.*?) ", src).group(1).strip()
        boa = re.search("boa=(.*?) ", src).group(1).strip()
        nelmon = re.search("nelmon=(.*?) ", src).group(1).strip()
        link = re.search("link=(.*)", src).group(1).strip()

        try:
            tc = soup.find(text=tc_name)  #node of text:test case name in soup
            #print tc
            tc.previous['href'] = link  #update link
            verdict_list = tc.parent.parent.findNextSiblings(
                'td', limit=7)  #verdict, tw, nelmon, assc, script, mgw, boa
            #print verdict_list
            #update verdict
            if "PASS" == verdict:
                tc.parent.parent['bgcolor'] = "green"
                verdict_list[0]['bgcolor'] = "green"
                verdict_list[0].contents[0].replaceWith("PASS")
            elif "FAIL" == verdict:
                tc.parent.parent['bgcolor'] = "red"
                verdict_list[0]['bgcolor'] = "red"
                verdict_list[0].contents[0].replaceWith("FAIL")
            elif "WARNING" == verdict:
                tc.parent.parent['bgcolor'] = 'yellow'
                verdict_list[0]['bgcolor'] = 'yellow'
                verdict_list[0].contents[0].replaceWith('WARNING')

            #update TW
            if "PASS" == tw:
                verdict_list[1]['bgcolor'] = "green"
                verdict_list[1].contents[0].replaceWith("PASS")
            elif "FAIL" == tw:
                verdict_list[1]['bgcolor'] = "red"
                verdict_list[1].contents[0].replaceWith("FAIL")

            #update Nelmon
            if "PASS" == nelmon:
                verdict_list[2]['bgcolor'] = "green"
                verdict_list[2].contents[0].replaceWith("PASS")
            elif "FAIL" == nelmon:
                verdict_list[2]['bgcolor'] = "red"
                verdict_list[2].contents[0].replaceWith("FAIL")

            #update assc
            if "PASS" == assc:
                verdict_list[3]['bgcolor'] = "green"
                verdict_list[3].contents[0].replaceWith("PASS")
            elif "FAIL" == assc:
                verdict_list[3]['bgcolor'] = "red"
                verdict_list[3].contents[0].replaceWith("FAIL")

            #update script
            if "PASS" == script:
                verdict_list[4]['bgcolor'] = "green"
                verdict_list[4].contents[0].replaceWith("PASS")
            elif "FAIL" == script:
                verdict_list[4]['bgcolor'] = "red"
                verdict_list[4].contents[0].replaceWith("FAIL")

            #update mgw
            if re.search("PASS", mgw):
                verdict_list[5]['bgcolor'] = "green"
                verdict_list[5].contents[0].replaceWith("PASS")
            elif re.search("FAIL", mgw):
                verdict_list[5]['bgcolor'] = "red"
                verdict_list[5].contents[0].replaceWith("FAIL")
            elif re.search("ALERT|CRITICAL", mgw):
                verdict_list[5]['bgcolor'] = "#800000"
                verdict_list[5].contents[0].replaceWith("CRITICAL")
                tc.parent.parent['bgcolor'] = "#800000"

            #update boa
            if "PASS" == boa:
                verdict_list[6]['bgcolor'] = "green"
                verdict_list[6].contents[0].replaceWith("PASS")
            elif "FAIL" == boa:
                verdict_list[6]['bgcolor'] = "red"
                verdict_list[6].contents[0].replaceWith("FAIL")
        except:
            print "%s haven't been included in BU test cases, please contact with BU team" % tc_name
    else:
        execution_name = re.search("execution=(.*?) ", src).group(1).strip()
        mgw_version = re.search("mgw_version=(.*?)il", src).group(1).strip()
        #il_version = re.search("il_version=(.*?) ", src).group(1).strip()
        #fp_version = re.search("fp_version=(.*?) ", src).group(1).strip()
        #prbs_version = re.search("prbs_version=(.*?) ", src).group(1).strip()
        url = re.search("url=(.*)", src).group(1).strip()

        # since there is a "\n" at the end of every line, we need nextSibling twice
        # if mgw_version, il_version, fp_version and prbs_version are NA or empty, update the info; otherwise skip
        #update mgw_version
        MGW = soup.find(text="release lable:")
        if MGW.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        MGW.parent.nextSibling.nextSibling.contents[0] == "":
            MGW.parent.nextSibling.nextSibling.contents[0].replaceWith(
                mgw_version)
        #update il_version
        #IL = soup.find(text="IL version:")
        #if IL.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #        IL.parent.nextSibling.nextSibling.contents[0] == "":
        #    IL.parent.nextSibling.nextSibling.contents[0].replaceWith(il_version)
        #update fp_version
        #FP = soup.find(text="FP version:")
        #if FP.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #        FP.parent.nextSibling.nextSibling.contents[0] == "":
        #    FP.parent.nextSibling.nextSibling.contents[0].replaceWith(fp_version)
        #updat prbs_version
        #PRBS = soup.find(text= "PRBs version:")
        #if PRBS.parent.nextSibling.nextSibling.a['href'] == "NA":
        #    PRBS.parent.nextSibling.nextSibling.a['href'] = prbs_version
        #    PRBS.parent.nextSibling.nextSibling.contents[0].contents[0].replaceWith(prbs_version)
        #updat urls for executions
        for k in table_map.keys():
            n = 1
            for i in table_map[k]:
                #if re.search(i, prbs_version): #Use in Open MGW
                if True:  #use in IPA
                    #print k
                    if soup.find(
                            text=re.compile("%s.*" % k)
                    ) == None:  #if update sanity test cases result, go to next execution
                        break
                    node = soup.find(text=re.compile("%s.*" % k)).parent
                    temp_soup = BeautifulSoup()
                    tag = Tag(temp_soup, 'a')
                    text = NavigableString("%s" % url)
                    tag.insert(0, text)
                    tag['href'] = "%s" % url
                    node.insert(n, tag)
                    #print node
                    n = n + 1
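The src argument is a flat key=value string picked apart by the regexes above; a hypothetical per-test-case line in that shape (every field except link is terminated by a space):

src = ("name=HZ-TC_001 verdict=PASS assc=PASS tw=PASS mgw=PASS "
       "script=PASS boa=PASS nelmon=PASS link=http://ci.example/logs/123")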
Example #38
    # No external pages
    realurl = urldata('realurl')[0].text
    if not realurl.startswith(SEARCHED_ROOT):
        continue

    parturl = realurl[len(SEARCHED_ROOT):]

    if any(parturl.startswith(prefix) for prefix in IGNORE_PREFIXES):
        continue

    q_idx = parturl.find('?')
    if q_idx >= 0:
        parturl = parturl[:q_idx]

    parturls.add(parturl)

print >> sys.stderr, 'Building sitemap'

for parturl in sorted(parturls, reverse=True):
    url = Tag(sitemap, 'url')
    urlset.insert(1, url)
    loc = Tag(sitemap, 'loc')
    url.insert(0, loc)
    text = NavigableString(SITE_ROOT + parturl)
    loc.insert(0, text)

print >> sys.stderr, 'Outputting sitemap'

print sitemap.prettify()
Example #39
    def parse_content(self, content, attachments, tags):

        soup = BeautifulSoup(content)
        pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

        # images
        for match in soup.findAll('img'):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next(
                    (l['filename']
                     for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    importedname = self.import_file(filename)
                    match.replaceWith(Tag(soup, 'img',
                                          [('src', importedname)]))

        # pdfs
        for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):

            filehashmatch = pattern.search(str(match))
            if filehashmatch:
                filehash = filehashmatch.group(1)
                filename = next(
                    (l['filename']
                     for l in attachments if l['hash'] == filehash), None)

                if filename is not None:
                    # convert pdf -> image
                    images = pdf2image(filename)

                    # import each jpg
                    imageTags = Tag(soup, "span")
                    for image in images:
                        importedname = self.import_file(image)
                        # add new image tag
                        imageTags.insert(
                            images.index(image),
                            Tag(soup, 'img', [('src', importedname)]))

                    # replace embed with <img src...> for each image
                    match.replaceWith(imageTags)

        # audio
        # video

        #plugins

        #highlights
        # TODO: test
        # <span style="background-color: rgb(255, 204, 102); ">some text...</span>
        # -> <span class="highlight" style="background-color: rgb(255, 204, 102); ">some text...</span>
        #
        # if mw.col.conf.get(SETTING_TAG_HIGHLIGHTS, False) in tags:
        #     matches = soup.find(string=re.compile("<span style=\"background-color: rgb([0-9]+, [0-9]+, [0-9]+); \">.*</span>"))
        #     if matches is not None:
        #         for match in matches:
        #             match['class'] = match.get('class', []) + ['highlight']
        #
        #

        # TODO: qa
        #for match in soup.find(string=re.compile("A:")):
        #    match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']

        return str(soup).decode('utf-8')
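
A minimal sketch of the highlight TODO above, using the same BeautifulSoup 3 API as the rest of this example; the "highlight" class name comes from the comment, everything else (function name, regex) is an assumption:

import re
from BeautifulSoup import BeautifulSoup

def mark_highlights(html):
    # add a "highlight" class to spans that carry an inline background-color,
    # leaving the existing style attribute untouched
    soup = BeautifulSoup(html)
    for span in soup.findAll('span', {'style': re.compile(r'background-color:')}):
        span['class'] = (span.get('class', '') + ' highlight').strip()
    return str(soup)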
Beispiel #40
0
def rd_parse_post(entry):
    blogger_id = entry.id
    created = entry.published.split('.')[:1][0].replace('T', ' ')
    updated = entry.updated.split('.')[:1][0].replace('T', ' ')

    link = entry.link  #[-1]
    url = link.replace('http://rugbydump.blogspot.com', '')
    title = entry.title.encode('ASCII', 'ignore')

    content = entry.summary
    content = renode.sub(node, content).encode('ASCII', 'ignore')

    # Fix up content a bit
    xcontent = bsoup(content)
    img = xcontent.img
    src = img['src'].split('/')[-1]
    img['src'] = '/media/posts/' + src
    img['alt'] = title

    del (img['border'])
    del (img['style'])
    del (img['id'])

    # Put a centered paragraph around the image
    np = Tag(xcontent, 'p', [('style', 'text-align: center;')])
    np.insert(0, img)

    try:
        xcontent.a.replaceWith(
            np)  # Takes away the link around the first image
    except:
        xcontent.insert(
            0, np
        )  # fallback: no link to replace, so just insert it (the empty link will unfortunately remain)

    # Remove the last div
    xcontent.findAll('div', attrs={'class':
                                   'blogger-post-footer'})[0].extract()

    try:
        blurb = xcontent.span.contents[0]
    except:
        blurb = ''

    content = xcontent.prettify()

    try:
        numcomments = entry.thr_total
    except AttributeError:
        numcomments = 0

    try:
        return {
            'src': src,
            'created': created,
            'updated': updated,
            'url': url,
            'numcomments': numcomments,
            'blogger_id': blogger_id,
            'title': title,
            'blurb': blurb,
            'content': content,
        }
    except UnicodeDecodeError:
        print "Skipping post \"%s\".." % title
        return
            a = span.find("a")
            example = a["href"]
            if os.path.isfile(example):
                example = get_beautiful_file(example)
                ul = example.body.find("ul")
                if ul is not None:
                    span.append(ul)
                    count += 1
                    replace_spans(a, span, example, count)

    # find all <a> tags that are examples and replace them with the standard <span><a></a></span> pattern
        for a in soup.findAll("a", {"class": "example_icon"}):
            span = Tag(soup, "span", [("class", "example_icon")])
            soup.insert(0, span)
            example = a["href"]
            a["class"] = ""
            a.replaceWith(span)
            if os.path.isfile(example):
                example = get_beautiful_file(example)
                ul = example.body.find("ul")
                span.insert(0, a)
                span.insert(1, ul)
                count += 1
                replace_spans(a, span, example, count)

        new_filename = filename[:-4]
        new_filename = new_filename + "_new.html"
        f = open(new_filename, "w")
        f.write(soup.prettify())
        f.close()
def LIST_EPISODES_DB(seriestitle,
                     season,
                     poster,
                     HDonly=False,
                     path=False,
                     NFO=True):
    import tv as tvDB
    episodes = tvDB.loadTVEpisodesdb(seriestitle, season, HDonly)
    #asin,seriestitle,season,episode,episodetitle,url,plot,airdate,runtime,isHD,isprime,watched
    for asin, seriestitle, season, episode, episodetitle, url, plot, airdate, runtime, isHD, isprime, watched in episodes:
        episodetitle = episodetitle.replace(':', '').replace('/', ' ').replace(
            '[HD]', '').strip()
        if seriestitle in episodetitle:
            episodetitle = episodetitle.replace(
                seriestitle, '').strip().strip(',').strip('')
        if 'Season ' in episodetitle:
            episodetitle = episodetitle.replace('Season ', 'S')
        filename = 'S%sE%s - %s' % (season, episode,
                                    cleanfilename(episodetitle))
        CreateStreamFile(filename, url, path)
        if NFO:
            soup = BeautifulStoneSoup()
            episodedetails = Tag(soup, "episodedetails")
            soup.insert(0, episodedetails)
            episodedetails.insert(
                0, createElement('title', episodetitle + ' (Amazon)'))
            if season:
                episodedetails.insert(1, createElement('season', str(season)))
            if episode:
                episodedetails.insert(2, createElement('episode',
                                                       str(episode)))
            if plot:
                episodedetails.insert(3, createElement('plot', plot))
            if airdate:
                episodedetails.insert(4, createElement('aired', airdate))
                episodedetails.insert(5, createElement('premiered', airdate))
            episodedetails.insert(6, createElement('thumb', poster))

            fileinfo = createElement('fileinfo', '')
            streamdetails = createElement('streamdetails', '')
            audio = createElement('audio', '')
            audio.insert(0, createElement('channels', '2'))
            audio.insert(1, createElement('codec', 'aac'))
            streamdetails.insert(0, audio)
            video = createElement('video', '')
            video.insert(0, createElement('codec', 'h264'))
            if isHD:
                video.insert(1, createElement('height', '720'))
                video.insert(2, createElement('width', '1280'))
                video.insert(3, createElement('aspect', '1.778'))
            else:
                video.insert(1, createElement('height', '480'))
                video.insert(2, createElement('width', '640'))
            video.insert(3, createElement('scantype', 'Progressive'))
            streamdetails.insert(1, video)
            fileinfo.insert(0, streamdetails)
            episodedetails.insert(7, fileinfo)

            episodeNFO = os.path.join(path, filename + '.nfo')
            file = open(episodeNFO, 'w')
            file.write(str(soup))
            file.close()
def createElement(tagname, contents):
    soup = BeautifulSoup()
    element = Tag(soup, tagname)
    text = NavigableString(contents)
    element.insert(0, text)
    return element
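
For reference, a quick usage sketch of the createElement helper defined above (the tag name and text are arbitrary examples, not taken from the original):

title_tag = createElement('title', 'My Show (Amazon)')
print title_tag  # prints <title>My Show (Amazon)</title>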
from ReadFile import readFile
from BeautifulSoup import BeautifulSoup, Tag


doc = readFile("xml/book.xml")
soup =  BeautifulSoup(doc)

# replace all authors
tags = soup.findAll("author")
i = 0
for oldTag in tags:
    i = i + 1
    newTag = Tag(soup, "newTag", [("id", str(i))])
    newTag.insert(0, "text #" + str(i))
    oldTag.replaceWith(newTag)

print soup.prettify()


Beispiel #45
0
        readable_article = Document(html).summary().encode('utf-8')
        readable_title = Document(html).short_title()

        manifest += '<item id="article_%s" href="article_%s.html" media-type="application/xhtml+xml"/>\n' % (
            i + 1, i + 1)
        spine += '<itemref idref="article_%s" />\n' % (i + 1)
        toc += '<navPoint id="navpoint-%s" playOrder="%s"> <navLabel> <text>%s</text> </navLabel> <content src="article_%s.html"/> </navPoint>' % (
            i + 2, i + 2, cgi.escape(readable_title), i + 1)

        soup = BeautifulSoup(readable_article)
        #Add xml namespace
        soup.html["xmlns"] = "http://www.w3.org/1999/xhtml"
        #Insert header
        body = soup.html.body
        h1 = Tag(soup, "h1", [("class", "title")])
        h1.insert(0, cgi.escape(readable_title))
        body.insert(0, h1)

        #Add stylesheet path
        head = soup.find('head')
        if head is None:
            head = Tag(soup, "head")
            soup.html.insert(0, head)
        link = Tag(soup, "link", [("type", "text/css"), ("rel", "stylesheet"),
                                  ("href", "stylesheet.css")])
        head.insert(0, link)
        article_title = Tag(soup, "title")
        article_title.insert(0, cgi.escape(readable_title))
        head.insert(1, article_title)

        #Download images
def makeImagesLocal(soup, params):
    """ deal with internal and external image references """

    for img in soup.findAll('img'):
        # 'internal' images are marked with class="internal-resource"
        # in order to prevent image fetching later on
        if 'internal-resource' in (img.get('class') or ''):
            continue

        src = img['src']
        if params['request'] and src.startswith(params['request'].BASE0) \
            and '++resource++' not in src:
            src = src.replace(params['request'].BASE0 + '/', '')

        if src.startswith('http'):
            try:
                img_data = urllib2.urlopen(str(src)).read()

            except urllib2.URLError:
                LOG.warn('No image found: %s - removed from output' % src)
                img.extract()
                continue

            tmpname = tempfile.mktemp(dir=params['destdir'])
            file(tmpname, 'wb').write(img_data)
            img['src'] = os.path.basename(tmpname)

        else:
            # image with relative URL

            # first lookup image by direct traversal
            img_path = urllib.unquote(str(src))
            img_obj = params['context'].restrictedTraverse(img_path, None)
            if img_obj is None:
                img_path2 = getToolByName(
                    params['context'], 'portal_url').getPortalPath() + img_path
                img_obj = params['context'].restrictedTraverse(img_path2, None)

            if img_obj is None and 'resolveuid' in src:
                mo = uid_reg.search(src)
                if mo:
                    uid = mo.group(0)
                    img_obj = params['context'].reference_catalog.lookupObject(
                        uid)

            # For scaled images ('_preview', '_large' etc.) use the original
            # image always (which is stored as acquisition parent)
            if img_obj:
                has_portal_type = hasattr(aq_base(img_obj.aq_inner),
                                          'portal_type')
                if has_portal_type and img_obj.portal_type == img_obj.aq_parent.portal_type:
                    img_obj = img_obj.aq_parent

            if img_obj is None:
                # nothing found, check the next parent node with a 'path' parameter
                # referring to the origin document
                parent_container_path = pathFromParent(soup, img)
                if parent_container_path is not None:
                    img_obj = params['context'].restrictedTraverse(
                        '%s/%s' % (parent_container_path, img_path), None)

            # still nothing found
            if img_obj is None:

                img_split = img_path.split('/')
                if img_split[-1].startswith(
                        'image_') or img_split[-1].startswith('image-'):
                    img_path = '/'.join(img_split[:-1])
                for image_path in params['images']:
                    if image_path.endswith(img_path):
                        img_obj = params['context'].restrictedTraverse(
                            image_path, None)
                        break

            # get hold of the image in original size
            if img_obj:
                # thumbnails have an Image as aq_parent
                if img_obj.aq_parent.portal_type == 'Image':
                    img_obj = img_obj.aq_parent

            if img_obj:
                img_data = None
                for attr in ['data', '_data']:
                    try:
                        img_data = str(getattr(img_obj, attr))
                        continue
                    except AttributeError:
                        pass
                if img_data == None:
                    LOG.warn('No image found: %s - removed from output' %
                             img_path)
                    img.extract()
                    continue

                tmpname = tempfile.mktemp(dir=params['destdir'])
                file(tmpname, 'wb').write(img_data)
                img['src'] = os.path.basename(tmpname)

                # image scaling
                try:
                    scale = img_obj.getField('pdfScale').get(img_obj)
                except AttributeError:
                    scale = 100

                # add content-info debug information
                # don't add scale as style since the outer image-container
                # has the style set
                img['scale'] = str(scale)

                # now move <img> tag into a dedicated <div>
                div = Tag(soup, 'div')
                div['class'] = 'image-container'
                #                div['style'] = 'width: %d%%' % scale
                div['scale'] = str(scale)
                div.insert(0, copy.copy(img))

                # image caption
                img_description = img_obj.Description()
                img_caption = Tag(soup, 'div')
                img_caption['class'] = 'image-caption'

                # exclude from image enumeration
                context = params['context']
                exclude_field = img_obj.getField('excludeFromImageEnumeration')
                if exclude_field and not exclude_field.get(img_obj):
                    span = Tag(soup, 'span')
                    classes = ['image-caption-text']
                    description = img_obj.Description()
                    if description:
                        classes.append('image-caption-text-with-text')
                    else:
                        classes.append('image-caption-text-without-text')
                    span['class'] = ' '.join(classes)
                    if description:
                        span.insert(0, NavigableString(description))
                    img_caption.insert(0, span)
                    div.append(img_caption)

                img.replaceWith(div)

            else:
                LOG.warn('No image found: %s - not removed, keeping it' %
                         img_path)
Beispiel #47
0
def main(argv=None):
    if argv == None:
        argv = sys.argv[1:]
    parser = make_option_parser()
    (options, args) = parser.parse_args(argv)
    log.debug(options)
    log.setLevel(options.verbose or logging.WARN)
    
    global minify
    if not options.minify:
        minify = lambda x, path=None: x
    elif options.google:
        minify = google_jar_minify
    elif options.google_rest:
        minify = google_minify
    if len(args) != 1:
        print "Invalid position arguments"
        parser.print_help()
        sys.exit(1)
    
    INPUT = args[0]
    OUTPUT = options.output
    options.initialImport = ""

    if INPUT.split('.')[-1] not in ('html', 'js', 'pkg'):
        print "Invalid input file; jsio_compile only operats on .js and .html files"
        sys.exit(1)
    
    compile_kwargs = {}
    if INPUT.endswith('pkg'):
        INPUT, options, compile_kwargs = \
            load_package_configuration(INPUT, options)
    output = \
        compile_source(INPUT, options, **compile_kwargs)
    
    # the root script needs to be able to recognize itself so that it can
    # figure out where it is. we modify the generated script to store the
    # expected script name. later on, we can compare that against script
    # tag src's.
#    output = \
#        output.replace(get_script_src_assignment('jsio.js'),
#                       get_script_src_assignment(os.path.basename(OUTPUT)))
                       
#    expose = re.compile('window\.jsio\s=\sjsio;');
#    expose = re.compile('this\.global\.jsio\s=\sjsio');
#    output = expose.sub(options.initialImport + (options.exposeJsio and ';this.global.jsio=jsio;' or ''), output, 1);
    output += options.initialImport;
    if options.minify:
        log.info("Minifying")
        output = minify(output, path='output')
    else:
        log.info("Skipping minify")
    print "Writing output %s" % OUTPUT
    
    # TODO: clean up this hack to write .html files back out
    if INPUT.endswith('.html'):
        orig_source = get_source(INPUT)
        soup = Soup(orig_source)
        orig_source = ""
        from BeautifulSoup import Tag
        tag1 = Tag(soup, "script")
        tag1.insert(0, output)
        soup.head.insert(0, tag1)
        output = soup.prettify()
        
    if OUTPUT:
        f = fileopen(OUTPUT, 'w')
        f.write(output)
        f.close()
    print "RETURNING", len(output)
    return output
Beispiel #48
0
def addServerInfo(
    soup, serverinfo, uid, snamevalue, urlvalue, unamevalue, passwordvalue
):
    """
    @description: Adds server info to the soup
    @todo:None
    @param soup: soup
    @param serverinfo:
    @param uid: Unique Id of the server
    @param snamevalue: String, server name
    @param urlvalue: String, url of the server
    @param unamevalue: String, UserName for the server
    @param passwordvalue: String, password for the server
    @return: Boolean, True if added successfully, else False
    """
    snamevalue = unicode(snamevalue)
    if ifServerNameExists(soup, snamevalue):
        return False
    else:
        server = Tag(soup, "server")
        serverinfo.insert(0, server)
        # Creating server info tags
        servername = Tag(soup, "servername")
        serverurl = Tag(soup, "serverurl")
        username = Tag(soup, "username")
        password = Tag(soup, "password")
        # Inserting server info fields
        server.insert(0, servername)
        server.insert(1, serverurl)
        server.insert(2, username)
        server.insert(3, password)
        # Adding attribute to server tag
        server["id"] = uid
        # Adding text values to the server info fields
        servername.insert(0, snamevalue)
        serverurl.insert(0, urlvalue)
        username.insert(0, unamevalue)
        password.insert(0, passwordvalue)
        return True
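
A hedged usage sketch for addServerInfo based on its docstring; the <serverinfo> root element and the argument values are assumptions, and the ifServerNameExists helper it calls is expected to be defined alongside it:

from BeautifulSoup import BeautifulStoneSoup, Tag

soup = BeautifulStoneSoup()
serverinfo = Tag(soup, "serverinfo")
soup.insert(0, serverinfo)
if addServerInfo(soup, serverinfo, "1", "build-server",
                 "http://example.com", "admin", "secret"):
    print soup.prettify()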
def LIST_TVSHOWS(NFO=False):
    import tv as tvDB
    shows = tvDB.loadTVShowdb(favorfilter=True)
    if (common.addon.getSetting('enablelibraryfolder') == 'true'):
        SetupAmazonLibrary()
    elif (common.addon.getSetting('customlibraryfolder') <> ''):
        CreateDirectory(MOVIE_PATH)
        CreateDirectory(TV_SHOWS_PATH)
    for seriestitle, plot, creator, network, genres, actors, year, stars, votes, episodetotal, watched, unwatched, isHD, isprime, favor, TVDBbanner, TVDBposter, TVDBfanart, TVDBseriesid in shows:
        directorname = os.path.join(TV_SHOWS_PATH,
                                    seriestitle.replace(':', ''))
        CreateDirectory(directorname)
        if NFO:
            soup = BeautifulStoneSoup()
            tvshow = Tag(soup, "tvshow")
            soup.insert(0, tvshow)
            tvshow.insert(0, createElement('title', seriestitle))
            if year:
                tvshow.insert(1, createElement('year', str(year)))
            if plot:
                tvshow.insert(2, createElement('plot', plot))
            if votes:
                tvshow.insert(3, createElement('votes', str(votes)))
            if stars:
                tvshow.insert(4, createElement('rating', str(stars)))
            if creator:
                tvshow.insert(5, createElement('credits', creator))
            if network:
                tvshow.insert(6, createElement('studio', network))
            if TVDBseriesid:
                tvshow.insert(7, createElement('id', TVDBseriesid))
                episodeguide = createElement('episodeguide', '')
                url = createElement(
                    'url',
                    'http://www.thetvdb.com/api/03B8C17597ECBD64/series/' +
                    TVDBseriesid + '/all/en.zip')
                url['cache'] = TVDBseriesid + '.xml'
                episodeguide.insert(0, url)
                tvshow.insert(8, episodeguide)
            if TVDBfanart:
                fanart_tag = createElement('fanart', '')
                fanart_tag['url'] = 'http://thetvdb.com/banners/'
                fanart_tag.insert(
                    0,
                    createElement(
                        'thumb',
                        TVDBfanart.replace('http://thetvdb.com/banners/', '')))
                tvshow.insert(9, fanart_tag)
            if TVDBposter:
                tvshow.insert(10, createElement('thumb', TVDBposter))
            elif TVDBbanner:
                tvshow.insert(11, createElement('thumb', TVDBbanner))
            index = 11
            if genres:
                for genre in genres.split(','):
                    index += 1
                    tvshow.insert(index, createElement('genre', genre))
            if actors:
                for actor in actors.split(','):
                    if actor <> None:
                        index += 1
                        actortag = createElement('actor', '')
                        actorname = createElement('name', actor)
                        actortag.insert(0, actorname)
                        tvshow.insert(index, actortag)
        seasonTotal, episodeTotal, seasons = LIST_TV_SEASONS(seriestitle, isHD)
        for season, poster, hasHD in seasons:
            name = 'Season ' + str(season)
            if hasHD:
                name += ' HD'
            seasonpath = os.path.join(directorname, name)
            CreateDirectory(seasonpath)
            if NFO:
                postertag = createElement('thumb', poster)
                postertag['type'] = 'season'
                postertag['season'] = str(season)
                index += 1
                tvshow.insert(index, postertag)
            LIST_EPISODES_DB(seriestitle,
                             int(season),
                             poster,
                             HDonly=hasHD,
                             path=seasonpath)
        if NFO:
            index += 1
            tvshow.insert(index, createElement('season', seasonTotal))
            index += 1
            tvshow.insert(index, createElement('episode', episodeTotal))
            tvshowNFO = os.path.join(directorname, 'tvshow.nfo')
            data = str(soup)
            if TVDBseriesid:
                data = 'http://thetvdb.com/index.php?tab=series&id=' + TVDBseriesid
                file = open(tvshowNFO, 'w')
                file.write(data)
                file.close()
def mexhelpextract(mexnames):
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error
    for mexname in mexnames:
        # ConfigParser for the three elements per subfunction written to tmpdir
        # [SubFunction]
        # usage: 'xyz'
        # help: 'xyz'
        # seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})
        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \
            (_tmpdir, \
             os.path.splitext(os.path.basename(_mexscript))[0], \
             mexname, \
             _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd
        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print 'running MATLAB for %s in %s' % (mexname, _tmpdir)
            stdin, stderr = os.popen4(cmd)
            print stderr.read()
            stdin.close()
            stderr.close()
        except:
            print 'could not dump help for %s into %s' % (mexname, _tmpdir)

        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if cfgfile == []:
            print "skipping " + mexname + " (no output)"
            continue
        subfunctions = config.sections()
        print 'processing subfunctions: ' + subfunctions.__repr__()
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')

            headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n'
            breadcrumb = "==[[Psychtoolbox]] &#8250; [[" \
                                + mexname + "]].{mex*,dll} subfunction==\n\n"

            # scrub the text for main text only
            body = beackern(help)

            docstring = '' \
                    + '%%(matlab;Usage)' \
                    + usage \
                    + '%%\n' \
                    + body \
                    + '\n\n'
            if seealso:
                docstring = docstring + '<<=====See also:=====\n' + seealso + '<<'

            text =  '""' + headline \
                    + breadcrumb \
                    + docstring + '""'

            # retrieve old body text, to update or concatenate with synonymous subfunctions
            #
            # browse the page
            title = re.sub("[^\w]|_", "", subfunction)
            try:
                resp = mech.open(baseurl + title + "/edit")
            except HTTPError, e:
                sys.exit(
                    "retrieving old text during posting of this mex function failed: %d: %s"
                    % (e.code, e.msg))
            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except:
                print 'No id="body" form. Figure this out first. cf. page text above.'
                for form in mech.forms():
                    print form
                sys.exit(
                    "retrieving old body text failed while processing page: " +
                    baseurl + title + '/edit')

            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)

            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                subfct.contents[0].replaceWith(text)
            else:
                # construct new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))

                # insert the new div
                soup.insert(len(soup), subfctDIV)

            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})

            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()

            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del (div['style'])
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))

            post(subfunction, cheesesoup.renderContents())
Beispiel #51
0
def web2epub(urls,
             outfile=None,
             cover=None,
             title=None,
             author=None,
             images=None,
             footer=None,
             links=None,
             language=""):

    if (outfile == None):
        outfile = time.strftime('%Y-%m-%d-%S.epub')

    nos = len(urls)
    cpath = 'data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw=='
    ctype = 'image/gif'
    if cover is not None:
        cpath = 'images/cover' + os.path.splitext(os.path.abspath(cover))[1]
        ctype = mimetypes.guess_type(os.path.basename(
            os.path.abspath(cover)))[0]

    epub = MyZipFile(outfile, 'w', zipfile.ZIP_DEFLATED)

    # The first file must be named "mimetype"
    epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
    # We need an index file, that lists all other HTML files
    # This index file itself is referenced in the META_INF/container.xml file
    epub.writestr(
        "META-INF/container.xml", '''<container version="1.0"
        xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
        <rootfiles>
            <rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/>
        </rootfiles>
        </container>''')

    # The index file is another XML file, living per convention
    # in OEBPS/content.opf
    index_tpl = '''<package version="2.0"
        xmlns="http://www.idpf.org/2007/opf" unique-identifier="bookid">
        <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:title>%(title)s</dc:title>
        <dc:creator>%(author)s</dc:creator>
        <dc:date>%(date)s</dc:date>
        <dc:language>%(lang)s</dc:language>
        <meta name="cover" content="cover-image" />
        </metadata>
        <manifest>
          <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
          <item id="cover" href="cover.html" media-type="application/xhtml+xml"/>
          <item id="cover-image" href="%(front_cover)s" media-type="%(front_cover_type)s"/>
          <item id="css" href="stylesheet.css" media-type="text/css"/>
            %(manifest)s
        </manifest>
        <spine toc="ncx">
            <itemref idref="cover" linear="yes"/>
            %(spine)s
        </spine>
        <guide>
            <reference href="cover.html" type="cover" title="Cover"/>
        </guide>
        </package>'''

    toc_tpl = '''<?xml version='1.0' encoding='utf-8'?>
        <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
                 "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
        <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
        <head>
        <meta name="dtb:depth" content="1"/>
        <meta name="dtb:totalPageCount" content="0"/>
        <meta name="dtb:maxPageNumber" content="0"/>
      </head>
      <docTitle>
        <text>%(title)s</text>
      </docTitle>
      <navMap>
        <navPoint id="navpoint-1" playOrder="1"> <navLabel> <text>Cover</text> </navLabel> <content src="cover.html"/> </navPoint>
        %(toc)s
      </navMap>
    </ncx>'''

    cover_tpl = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head>
        <title>Cover</title>
        <style type="text/css">
                img { max-width: 100%%; }
                .centerpage {text-align:center; vertical-align:middle; margin-right: 100px; margin-left: 100px;}
        </style>
        </head>
        <body>
        <div class="centerpage">
        <img src="%(front_cover)s" alt="Cover image"/>
        <h2>%(author)s</h2>
        <h1>%(title)s</h1>
        <div id="cover-image">
        </div>
        </div>
        </body>
        </html>'''

    stylesheet_tpl = '''
        p, body {
            orphans: 2;
            widows: 2;
        }
    '''

    manifest = ""
    spine = ""
    toc = ""
    icon = None

    for i, url in enumerate(urls):
        print "Reading URL %s of %s --> %s " % (i + 1, nos, url)
        ##try:
        req = urllib.urlopen(url)
        html = req.read()
        readable_article = None
        ##try:
        document = readability.Document(html)
        document.TEXT_LENGTH_THRESHOLD = 200  # Gives better results than default
        readable_article = document.summary()
        readable_title = document.short_title()
        ##except:
        ##continue

        if (readable_article == None):
            continue

        manifest += '<item id="article_%s" href="article_%s.html" media-type="application/xhtml+xml"/>\n' % (
            i + 1, i + 1)
        spine += '<itemref idref="article_%s" />\n' % (i + 1)
        toc += '<navPoint id="navpoint-%s" playOrder="%s"> <navLabel> <text>%s</text> </navLabel> <content src="article_%s.html"/> </navPoint>' % (
            i + 2, i + 2, readable_title.encode('ascii',
                                                'xmlcharrefreplace'), i + 1)

        try:
            soup = BeautifulSoup(readable_article)
            #Add xml namespace
            soup.html["xmlns"] = "http://www.w3.org/1999/xhtml"
        except:
            continue

        # Insert header if it is not already there
        body = soup.html.body
        if not (ascii_chars(readable_title) in ascii_chars(readable_article)
                ):  # TODO: FIXME, this does not work yet, e.g., for ZEIT
            h1 = Tag(soup, "h1", [("class", "title")])
            h1.insert(0, escape(readable_title))
            body.insert(0, h1)

        if (links == None):
            refs = body.findAll('a')
            for x in refs:
                try:
                    tag = Tag(soup, 'span', [("class", "link-removed")])
                    tag.insert(0, x.text)
                    body.a.replaceWith(tag)
                except:
                    pass

        #Add stylesheet path
        head = soup.find('head')
        if head is None:
            head = Tag(soup, "head")
            soup.html.insert(0, head)
        link = Tag(soup, "link", [("type", "text/css"), ("rel", "stylesheet"),
                                  ("href", "stylesheet.css")])
        head.insert(0, link)
        article_title = Tag(soup, "title")
        article_title.insert(0, escape(readable_title))
        head.insert(1, article_title)

        # If we do not have an author for the book, then use the URL hostname of the first article
        if (author == None):
            author = str(urlparse.urlparse(url).hostname.replace("www.",
                                                                 "")) or ''

        # If we do not have a title for the book, then use the date
        if (title == None):
            if (len(urls) > 1):
                title = author + " " + str(time.strftime('%d.%m.%Y'))
                # title = readable_title
            else:
                title = readable_title

        if (images != None):
            #Download images
            for j, image in enumerate(soup.findAll("img")):
                #Convert relative urls to absolute urls
                imgfullpath = urlparse.urljoin(url, image["src"])
                #Remove query strings from url
                imgpath = urlparse.urlunsplit(
                    urlparse.urlsplit(imgfullpath)[:3] + (
                        '',
                        '',
                    ))
                print "    Downloading image: %s %s" % (j + 1, imgpath)
                imgfile = os.path.basename(imgpath)
                os.system("mogrify -resize 1200x1200 -quality 50 " + imgpath)
                filename = 'article_%s_image_%s%s' % (
                    i + 1, j + 1, os.path.splitext(imgfile)[1])
                if imgpath.lower().startswith("http"):
                    epub.writestr('OEBPS/images/' + filename,
                                  urllib.urlopen(imgpath).read())
                    image['src'] = 'images/' + filename
                    manifest += '<item id="article_%s_image_%s" href="images/%s" media-type="%s"/>\n' % (
                        i + 1, j + 1, filename,
                        mimetypes.guess_type(filename)[0])

        if (footer != None):
            p = Tag(soup, "p", [("class", "source-url")])
            p.insert(0, url)
            body.append(p)

        epub.writestr('OEBPS/article_%s.html' % (i + 1), str(soup))

        if (icon == None):
            icons_xpath = '//link[contains(@rel, "icon")][contains(@href, ".png")]/@href'
            tree = lxml.html.fromstring(
                html)  # FIXME: This fails on some encodings!
            # FIXME: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
            results = tree.xpath(icons_xpath)
            try:
                icon = urlparse.urljoin(url, sorted(results, reverse=False)[0])
                cpath = "images/cover"
                ctype = "image/png"
                print(icon)
            except:
                pass

    # This should never happen, but if it does, .encode crashes; hence we catch it
    if (title == None):
        title = "Unknown"
    if (author == None):
        author = "Unknown"

    #Metadata about the book
    info = dict(title=title.encode('ascii', 'xmlcharrefreplace'),
                author=author.encode('ascii', 'xmlcharrefreplace'),
                date=time.strftime('%Y-%m-%d'),
                lang=language,
                front_cover=cpath,
                front_cover_type=ctype)

    epub.writestr('OEBPS/cover.html', cover_tpl % info)
    if cover is not None:
        epub.write(os.path.abspath(cover),
                   'OEBPS/images/cover' + os.path.splitext(cover)[1],
                   zipfile.ZIP_DEFLATED)
    else:
        if (icon != None):
            epub.writestr('OEBPS/images/cover',
                          urllib.urlopen(icon).read(), zipfile.ZIP_DEFLATED)

    info['manifest'] = manifest
    info['spine'] = spine
    info['toc'] = toc

    # Finally, write the index and toc
    epub.writestr('OEBPS/stylesheet.css', stylesheet_tpl)
    epub.writestr('OEBPS/Content.opf', index_tpl % info)
    epub.writestr('OEBPS/toc.ncx', toc_tpl % info)
    return outfile
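
A possible invocation of web2epub as defined above; the URLs, title, and author are placeholder values, not taken from the original:

book = web2epub(['http://example.com/article-1',
                 'http://example.com/article-2'],
                outfile='articles.epub',
                title='Saved Articles',
                author='example.com',
                images=True,
                language='en')
print "Wrote %s" % book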
Beispiel #52
0
def hack_rst_examples(bs):
    example_index = 1
    for pretag in bs.findAll('pre', {'class':'literal-block'}):
        table = Tag(bs, "table", [('class', 'example')])
        tbody = Tag(bs, "tbody")
        table.insert(0, tbody)
        tr = Tag(bs, "tr")
        tbody.insert(0, tr)
        td = Tag(bs, "td", [('class', 'example_in')])
        tr.insert(0, td)
    
        pretag.replaceWith(table)
        td.insert(0, pretag)
    
        gentag = Tag(bs, "td", [('class', 'example_out')])
    
        tcall = Tag(bs, "span", [('id', ('gen_%d' % example_index))])
        tcall.insert(0, htmldecode(pretag.contents[0]))
    
        gentag.insert(0, tcall)
    
        tr.insert(1, gentag)
        example_index += 1
    
    
    head = bs.html.head
    
    for src in script_includes:
        head.insert(-1, Tag(bs, 'script', [('type', 'text/javascript'),
                                           ('src', src)]))
        
    # now insert some script to execute all the examples
    initscript = Tag(bs, 'script', [('type', 'text/javascript')])
    
    initscript.insert(0, '''
    function run_demos() {
    
    %s
    }
    ''' % ''.join(["mjt.run('gen_%d');\n" % ei for ei in range(1,example_index)]))
    head.insert(-1, initscript)

    bs.html.body.attrs.append(('onload', 'run_demos();'))
    
    print '%d examples found' % (example_index-1)
Beispiel #53
0
    def SetupHuluLibrary(self):
        print "Trying to add Hulu source paths..."
        source_path = os.path.join(xbmc.translatePath('special://profile/'),
                                   'sources.xml')
        dialog = xbmcgui.Dialog()

        self.CreateDirectory(MOVIE_PATH)
        self.CreateDirectory(TV_SHOWS_PATH)

        try:
            file = open(source_path, 'r')
            contents = file.read()
            file.close()
        except:
            dialog.ok(
                "Error",
                "Could not read from sources.xml, does it really exist?")
            file = open(source_path, 'w')
            contents = "<sources>\n"
            contents += "    <programs>"
            contents += "        <default pathversion=\"1\"></default>"
            contents += "    </programs>"
            contents += "    <video>"
            contents += "        <default pathversion=\"1\"></default>"
            contents += "    </video>"
            contents += "    <music>"
            contents += "        <default pathversion=\"1\"></default>"
            contents += "    </music>"
            contents += "    <pictures>"
            contents += "        <default pathversion=\"1\"></default>"
            contents += "    </pictures>"
            contents += "    <files>"
            contents += "        <default pathversion=\"1\"></default>"
            contents += "    </files>"
            contents += "</sources>"
            file.write(contents)
            file.close()

        soup = BeautifulSoup(contents)
        video = soup.find("video")

        if len(soup.findAll(text="Hulu Movies")) < 1:
            movie_source_tag = Tag(soup, "source")
            movie_name_tag = Tag(soup, "name")
            movie_name_tag.insert(0, "Hulu Movies")
            movie_path_tag = Tag(soup, "path")
            movie_path_tag['pathversion'] = 1
            movie_path_tag.insert(0, MOVIE_PATH)
            movie_source_tag.insert(0, movie_name_tag)
            movie_source_tag.insert(1, movie_path_tag)
            video.insert(2, movie_source_tag)

        if len(soup.findAll(text="Hulu Subscriptions")) < 1:
            tvshow_source_tag = Tag(soup, "source")
            tvshow_name_tag = Tag(soup, "name")
            tvshow_name_tag.insert(0, "Hulu Subscriptions")
            tvshow_path_tag = Tag(soup, "path")
            tvshow_path_tag['pathversion'] = 1
            tvshow_path_tag.insert(0, TV_SHOWS_PATH)
            tvshow_source_tag.insert(0, tvshow_name_tag)
            tvshow_source_tag.insert(1, tvshow_path_tag)
            video.insert(2, tvshow_source_tag)

        string = ""
        for i in soup:
            string = string + str(i)

        file = open(source_path, 'w')
        file.write(str(soup))
        file.close()
        print "Source paths added!"
from BeautifulSoup import BeautifulSoup, Tag, NavigableString


soup =  BeautifulSoup()

tag1 = Tag(soup, "person")
tag2 = Tag(soup, "name", [("first","John"),("last","Smith")])
tag3 = Tag(soup, "location", [("country", "uk")])
soup.insert(0, tag1)
tag1.insert(0, tag2)
tag1.insert(1, tag3)
print soup
text = NavigableString("John Gary Smith")
tag2.insert(0, text)
print soup.prettify()


Beispiel #55
0
#
items = soup.findAll("p", {"class": "g"})

for item in items:
    #	print div
    wspan = item.find("span", {"class": "w"})
    #	print wspan
    # Hmm, this should never happen, but it does!
    if not wspan:
        continue
    a = wspan.find('a')
    if not a:
        continue
    if not a['href']:
        continue

    cul = Tag(soup, "a")

    cul['href'] = "/posturl?url=" + urllib.quote(a['href'])
    img = Tag(soup, "img")
    img['src'] = "http://static.citeulike.org/favicon.gif"
    img['style'] = "border:0"
    cul.insert(0, img)
    wspan.insert(99, cul)
#	print wspan.prettify()

if testing == 0:
    print soup
else:
    print soup.prettify()
Beispiel #56
0
    except:
        print "Can't read the dictionary file", dictionary_file_name
        exit()

html_index_data = open('../static/index_en.html').read()
#shutil.copyfile(, './index_%s.html' % translate_to)

index_soup = BeautifulSoup(html_index_data)
for link in index_soup.findAll('a'):
    article_link = unicode(normalize_title(link['href'][6:]))

    if article_link in trans_dict:
        #try:
        link['href'] = '/wiki/%s' % trans_dict[article_link]
        link['title'] = trans_dict[article_link]
        link.find(text=link.text).replaceWith(trans_dict[article_link])
        #except:
        #    print "translation not found"
        #    pass
    else:
        link['class'] = 'TRANS_PENDING'

style = Tag(index_soup, "style")
index_soup.html.head.insert(1, style)
style_text = NavigableString('.TRANS_PENDING {background-color:red;}')
style.insert(0, style_text)

translated_index = open('./index_%s.html' % translate_to, 'w')
translated_index.write(str(index_soup))
translated_index.close()
Beispiel #57
0
def sunset_embed(body, request=False):
    # Moved the import down here to avoid a circular import
    from sunset.models import image
    self_closing = [
        'sunset',
    ]

    if body and "<sunset" in body:
        body_raw = BeautifulSoup(body, selfClosingTags=self_closing)
        imglist = body_raw.findAll('sunset')

        for imgtag in imglist:
            err = 'Unknown error parsing Sunset embed tag'
            new_tag = ''
            img_pk = imgtag.get('id', False)
            cur_type = imgtag.get('type', 'icon')
            if img_pk:
                img_check = image.objects.filter(pk=int(img_pk)).filter(
                    access_query(request)).select_related('cat')

                if img_check:
                    cur_img = img_check.first()
                    asset_check = cur_img.assets.filter(type=cur_type)

                    if asset_check:
                        cur_asset = asset_check.first()
                        new_tag = BeautifulSoup(selfClosingTags=self_closing)
                        new_a = Tag(new_tag, 'a')
                        new_img = Tag(new_tag, 'img')

                        new_a['class'] = 'sunset_embed sunset_%s' % cur_type
                        new_a['href'] = cur_img.get_absolute_url()
                        new_a['title'] = cur_img

                        new_img['alt'] = cur_img
                        new_img['title'] = cur_img

                        new_img['src'] = cur_asset.get_url()

                        new_tag.insert(0, new_a)
                        new_a.insert(0, new_img)
                        err = False

                    else:
                        err = 'Sunset image asset type specified in embed tag was not found'

                else:
                    err = 'Sunset image specified in embed tag was not found'

            else:
                err = 'Invalid or missing image ID in Sunset embed tag'

            if err:
                imgtag.replaceWith(
                    Comment('%s.  Original was:  %s' % (err, imgtag)))
            else:
                imgtag.replaceWith(new_tag)

        return unicode(body_raw)

    else:
        # Nothing to do.
        return body
def LIST_MOVIES():
    if (common.addon.getSetting('enablelibraryfolder') == 'true'):
        SetupAmazonLibrary()
    elif (common.addon.getSetting('customlibraryfolder') <> ''):
        CreateDirectory(MOVIE_PATH)
        CreateDirectory(TV_SHOWS_PATH)
    import movies as moviesDB
    movies = moviesDB.loadMoviedb(favorfilter=True)
    for asin, movietitle, url, poster, plot, director, writer, runtime, year, premiered, studio, mpaa, actors, genres, stars, votes, TMDBbanner, TMDBposter, TMDBfanart, isprime, watched, favor, TMDB_ID in movies:
        CreateStreamFile(movietitle, url, MOVIE_PATH)
        soup = BeautifulSoup()
        movie = Tag(soup, "movie")
        soup.insert(0, movie)
        movie.insert(0, createElement('title', movietitle + ' (Amazon)'))
        if year:
            movie.insert(1, createElement('year', str(year)))
        if premiered:
            movie.insert(1, createElement('premiered', premiered))
        if plot:
            movie.insert(2, createElement('plot', plot))
        if runtime:
            movie.insert(2, createElement('runtime', runtime))
        if votes:
            movie.insert(3, createElement('votes', str(votes)))
        if stars:
            movie.insert(4, createElement('rating', str(stars)))
        if director:
            movie.insert(5, createElement('director', director))
        if studio:
            movie.insert(6, createElement('studio', studio))
        if poster:
            movie.insert(7, createElement('thumb', poster))
        if mpaa:
            movie.insert(8, createElement('mpaa', mpaa))
        u = sys.argv[0]
        u += '?url="' + urllib.quote_plus(url) + '"'
        u += '&mode="play"'
        u += '&name="' + urllib.quote_plus(movietitle) + '"'
        utrailer = u + '&sitemode="PLAYTRAILER"'
        movie.insert(9, createElement('trailer', utrailer))
        fileinfo = createElement('fileinfo', '')
        streamdetails = createElement('streamdetails', '')
        audio = createElement('audio', '')
        audio.insert(0, createElement('channels', '2'))
        audio.insert(1, createElement('codec', 'aac'))
        streamdetails.insert(0, audio)
        video = createElement('video', '')
        video.insert(0, createElement('codec', 'h264'))
        video.insert(1, createElement('height', '400'))
        video.insert(2, createElement('width', '720'))
        video.insert(4, createElement('scantype', 'Progressive'))
        streamdetails.insert(1, video)
        fileinfo.insert(0, streamdetails)
        movie.insert(10, fileinfo)
        index = 10
        if genres:
            for genre in genres.split(','):
                index += 1
                movie.insert(index, createElement('genre', genre))
        if actors:
            for actor in actors.split(','):
                if actor <> None:
                    index += 1
                    actortag = createElement('actor', '')
                    actorname = createElement('name', actor)
                    actortag.insert(0, actorname)
                    movie.insert(index, actortag)
        movieNFO = os.path.join(MOVIE_PATH, movietitle + '.nfo')
        file = open(movieNFO, 'w')
        file.write(str(soup))
        file.close()
Beispiel #59
0
    chart_url += str(value_killed)
    chart_url += ','
    chart_url += str(value_killer)
    chart_url += '&chtt=Twitter+Analysis+Chart'

    #http://chart.apis.google.com/chart?chxl=0:|Policeman+Killed|Killed+by+police&chxs=0,676767,11.5,0,lt,676767&chxt=x&chbh=a,100&chs=300x200&cht=bvg&chco=FF0000&chd=t:30,70&chtt=Twitter+Analysis+Chart

    # Now, create an HTML page with the information
    #  The page is simple: head with title, body with a big div holding an image (the chart) and 5 additional divs with text
    htmldata = BeautifulSoup()

    htmltag = Tag(htmldata, "html")
    headtag = Tag(htmldata, "head")

    titletag = Tag(htmldata, "title")
    titletag.insert(0, NavigableString('Twitter Stream Analysis Example'))

    bodytag = Tag(htmldata, "body")

    imgtag = Tag(htmldata, "img")
    imgtag['src'] = chart_url

    divtag_wrap = Tag(htmldata, "div")
    divtag_t1 = Tag(htmldata, "div")
    divtag_t1.insert(
        0,
        NavigableString('Total sentences analyzed: ' + str(total_sentences) +
                        ' taken from 400 public tweets'))

    divtag_t2 = Tag(htmldata, "div")
    divtag_t2.insert(