# Beispiel #1
# 0
 def CreateBody(self):
     '''Build a randomized <body>: a nested div skeleton whose leaf divs are
     filled with placeholder blocks ([header]/[main]/[links]/[footer]),
     menus, sidebars, forms and miscellaneous content.  The finished body
     is appended to self.soup.html.  Uses random throughout, so each call
     produces a different layout.
     '''
     body = Tag(self.soup, 'body')
     # Overall tag budget for the generated page.
     totalTagsCount = random.randint(150, 400)
     
     '''Создаем структуру шаблона из тегов div'''
     # Build the template structure out of div tags: 1-3 top-level divs,
     # then repeatedly extend random leaf divs until the div budget
     # (15-25% of the tag budget) is spent.
     for _ in range(random.randint(1, 3)):
         body.append(self.CreateDiv())
     divsTotalCount = totalTagsCount * random.randint(15, 25) / 100
     while divsTotalCount > 0:
         # Leaf divs = divs with no child tags at all.
         divsLowLevelList = [item for item in body.findAll('div') if len(item.findAll(True)) == 0]
         divToExtend = random.choice(divsLowLevelList)
         for _ in range(random.randint(2, 4)):
             divToExtend.append(self.CreateDiv())
             divsTotalCount -= 1
     
     '''Получаем список тегов div разных уровней'''
     # Partition divs by nesting level: top (direct children of body),
     # low (no children), mid (everything in between).
     divsList = body.findAll('div')
     divsTopLevelList = [item for item in body.findAll('div', recursive=False)]
     divsLowLevelList = [item for item in divsList if len(item.findAll(True)) == 0]
     divsMidLevelList = [item for item in divsList if item not in divsTopLevelList and item not in divsLowLevelList]
     
     '''Проставляем им атрибуты'''
     # Assign id/class attributes with level-specific probabilities.
     for item in divsTopLevelList:
         self.AppendIds(item, 95, 1)
     for item in divsMidLevelList:
         self.AppendIds(item, 20, 75)
     for item in divsLowLevelList:
         self.AppendIds(item, 30, 65)
         
     '''Создаем наполнение главных блоков'''
     # Reserve leaf divs for the main placeholder blocks; negative pop
     # indices pick from the end of the list.
     divHeader = divsLowLevelList.pop(random.randint(0, 2))
     divHeader.string = '[header]'
     divMain = divsLowLevelList.pop(random.randint(1, 3))
     divMain.string = '[main]'
     divLinks = divsLowLevelList.pop(random.randint(-3, -1))
     divLinks.string = '[links]'
     divFooter = divsLowLevelList.pop(random.randint(-3, -1))
     divFooter.string = '[footer]'
     
     '''Создаем меню, сайдбары и формы'''
     # Menus, sidebars and forms consume further leaf divs.
     for _ in range(random.randint(1, 2)):
         menu = divsLowLevelList.pop()
         menu.append(self.CreateList(0))
     for _ in range(random.randint(1, 2)):
         sidebar = divsLowLevelList.pop()
         self.CreateSidebar(sidebar)
     for _ in range(random.randint(0, 2)):
         form = divsLowLevelList.pop()
         form.append(self.CreateForm())
     
     '''Создаем прочее наполнение'''
     # Remaining leaf divs get miscellaneous content in random order.
     random.shuffle(divsLowLevelList)
     for _ in range(random.randint(2, 5)):
         div = divsLowLevelList.pop()
         self.CreateOthers(div)
     self.soup.html.append(body)
def get_last_3(soup, table):
    """Collect columns 4-6 of each table row into a <div> of <li> items.

    Each kept cell is renamed to <span>; all but the last kept cell in a
    row get a ' - ' separator appended.  Rows whose selected cells hold
    no text are dropped.

    soup:  BeautifulSoup document used as the owner for new tags.
    table: a <table> Tag whose rows are scanned.
    Returns a <div> Tag with one <li> per non-empty row.
    """
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        cells = tr.findAll("td")
        li = Tag(soup, "li")
        # Look only at the 4th, 5th and 6th cells of the row.
        for pos, el in enumerate(cells[3:]):
            if pos == 3:
                break
            try:
                text = ''.join(el.findAll(text=True)).strip()
                if text != '' and text != ' ':
                    el.name = "span"
                    # Separate all but the last kept cell with ' - '.
                    if pos != 2:
                        el.append(' - ')
                    li.append(el)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def get_first_three(soup, table):
    """Linearize the first three columns of a table into a titled <ul>.

    The first non-empty cell encountered becomes the section title; every
    other kept cell (renamed to <span>) is grouped per row into an <li>,
    separated by ' - '.  Returns a <div> holding the title span followed
    by the <ul> of rows.
    """
    first = True
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for loop, td in enumerate(tr.findAll("td")):
            # Only the first three cells of each row are considered.
            if loop == 3:
                break
            try:
                text = ''.join(td.findAll(text=True)).strip()
                if text != '' and text != ' ':
                    td.name = "span"
                    if first:
                        # The very first non-empty cell is the title.
                        first = False
                        enclose.append(td)
                    else:
                        if loop != 2:
                            td.append(' - ')
                        li.append(td)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # Pull the title span out of the list wrapper and place it first.
    # Guarded: a table with no usable cells used to crash here on None.
    title = enclose.find("span")
    if title is not None:
        title.replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    if title is not None:
        div.append(title)
    div.append(enclose)
    return div
def get_last_3(soup, table):
    """Collect columns 4-6 of each table row into a <div> of <li> items.

    Each kept cell is renamed to <span>; all but the last kept cell in a
    row get a ' - ' separator appended.  Rows whose selected cells hold
    no text are dropped.

    soup:  BeautifulSoup document used as the owner for new tags.
    table: a <table> Tag whose rows are scanned.
    Returns a <div> Tag with one <li> per non-empty row.
    """
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        cells = tr.findAll("td")
        li = Tag(soup, "li")
        # Look only at the 4th, 5th and 6th cells of the row.
        for pos, el in enumerate(cells[3:]):
            if pos == 3:
                break
            try:
                text = ''.join(el.findAll(text=True)).strip()
                if text != '' and text != ' ':
                    el.name = "span"
                    # Separate all but the last kept cell with ' - '.
                    if pos != 2:
                        el.append(' - ')
                    li.append(el)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def get_first_three(soup, table):
    """Linearize the first three columns of a table into a titled <ul>.

    The first non-empty cell encountered becomes the section title; every
    other kept cell (renamed to <span>) is grouped per row into an <li>,
    separated by ' - '.  Returns a <div> holding the title span followed
    by the <ul> of rows.
    """
    first = True
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for loop, td in enumerate(tr.findAll("td")):
            # Only the first three cells of each row are considered.
            if loop == 3:
                break
            try:
                text = ''.join(td.findAll(text=True)).strip()
                if text != '' and text != ' ':
                    td.name = "span"
                    if first:
                        # The very first non-empty cell is the title.
                        first = False
                        enclose.append(td)
                    else:
                        if loop != 2:
                            td.append(' - ')
                        li.append(td)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    # Pull the title span out of the list wrapper and place it first.
    # Guarded: a table with no usable cells used to crash here on None.
    title = enclose.find("span")
    if title is not None:
        title.replaceWith("")
    enclose.name = "ul"
    div = Tag(soup, "div")
    if title is not None:
        div.append(title)
    div.append(enclose)
    return div
# Beispiel #6
# 0
 def fix_heading(heading, tags):
     '''
     Remove paragraphs with no strings.
     Remove non-special headings that don't start with a paragraph.
     Remove lists from non-special headings.

     Returns (heading, tags) - possibly modified - or the result of
     drop_heading(heading) when the whole section is dropped.
     '''
     SPECIAL = ['Books', 'Works', 'Bibliography', 'External links',
                'Further reading']
     # Keep non-None tags that are either not paragraphs or non-empty
     # paragraphs.  BUGFIX: parentheses added - the original
     # `a and b or c` form evaluated `tag.renderContents` on None tags
     # and raised AttributeError.
     tags = [tag for tag in tags if tag is not None and
                 (tag.name != 'p' or tag.renderContents(None).strip())]
     special = False
     heading_text = tagtext(heading)
     for word in SPECIAL:
         if word.lower() in heading_text.lower():
             special = True
     if heading_text == 'External links and references':
         set_heading_text(heading, 'External links')
     # Shorten lists (even special ones).
     # The motivation is that some pages like to list reams of crap,
     # usually in bibliographies, but in other things too.
     found_lis = 0
     MAX_ITEMS = 10  # per headed section
     for tag in list(tags):
         if tag.name in ('ul', 'ol'):
             for li in tag.findAll('li', recursive=False):
                 found_lis += 1
                 if found_lis > MAX_ITEMS:
                     li.extract()
     # Remove any now-empty uls and ols.
     # Harder than it sounds, due to nested lists.
     # NOTE(review): `soup` is a global here - presumably the document
     # being processed; confirm it is in scope at the call site.
     temp = Tag(soup, 'p')
     for tag in tags:
         temp.append(tag)
     for tag in temp.findAll(('ul', 'ol')):
         if not tag.findAll(('ul', 'ol', 'li')):
             tag.extract()
     tags = temp.contents
     if found_lis > MAX_ITEMS:
         # Add " (some omitted)" to heading
         if heading_text:
             heading_text = heading_text.replace(' (incomplete)', '')
             if context['srcurl'].startswith('http:'):
                 heading_text += ' (some <a href="%s">omitted</a>)' % context['srcurl']
             else:
                 heading_text += ' (some omitted)'  # no "relative" links
             set_heading_text(heading, heading_text)
     if not special:
         if heading is not None:
             # Remove non-special headings which don't start with a paragraph.
             if not tags or tags[0].name != 'p':
                 return drop_heading(heading)
             # Remove non-special headings containing lists.
             for tag in tags:
                 if tag.name in ('ul', 'ol'):
                     return drop_heading(heading)
         else:
             # Remove lists from None (before first heading, if any).
             tags = [tag for tag in tags if tag.name not in ('ul', 'ol')]
     return (heading, tags)
# Beispiel #7
# 0
 def CreateList(self, probNested):
     '''Build a <ul> tag; with probability probNested some of its items
     receive a nested (flat) sub-list.  Items left without children get
     link text so no <li> ends up empty.'''
     listTag = Tag(self.soup, 'ul')
     self.AppendIds(listTag, 50, 30)
     itemClass = self.GenerateClass(0)
     itemCount = random.randint(3, 7)
     while itemCount > 0:
         listTag.append(self.CreateListItem(itemClass))
         itemCount -= 1
     if self._Probability(probNested):
         # Pick a random subset of items to receive a nested list.
         candidates = listTag.findAll('li')
         random.shuffle(candidates)
         for item in candidates[:random.randint(1, 4)]:
             item.append(self.CreateList(0))
     # Fill any still-empty item with link text.
     for item in listTag.findAll('li'):
         if not item.findAll(True):
             item.append(self.CreateLinkText())
     return listTag
# Beispiel #8
# 0
 def CreateList(self, probNested):
     '''Create a <ul>; with the given probability a few of its items get
     a nested sub-list.  Empty items are filled with link text.'''
     ul = Tag(self.soup, 'ul')
     self.AppendIds(ul, 50, 30)
     sharedClass = self.GenerateClass(0)
     total = random.randint(3, 7)
     for _ in range(total):
         ul.append(self.CreateListItem(sharedClass))
     if self._Probability(probNested):
         # Nest a flat list inside a random handful of items.
         items = ul.findAll('li')
         random.shuffle(items)
         chosen = items[:random.randint(1, 4)]
         for child in chosen:
             child.append(self.CreateList(0))
     # No <li> may stay empty - give the rest link text.
     for child in ul.findAll('li'):
         if len(child.findAll(True)) == 0:
             child.append(self.CreateLinkText())
     return ul
def linearize_cols_1_4(soup, table):
    """Flatten a 4-column table (id="linearize-cols-1-4") into a <ul>.

    Anchors are regrouped four per <li>, separated by ' | ' spans, and
    the original table is replaced in place.  Tables with any other id
    are left untouched.
    """
    if table.get('id') != "linearize-cols-1-4":
        return
    collected = Tag(soup, "ul")
    # Strip the leading cell from every row; four passes = four columns.
    for _ in range(4):
        for row in table.findAll("tr"):
            cell = row.find("td")
            row.find("td").replaceWith("")
            collected.append(cell)
    anchors = collected.findAll("a")
    groups = [anchors[i:i + 4] for i in range(0, len(anchors), 4)]
    result = Tag(soup, "ul")
    for group in groups:
        item = Tag(soup, "li")
        for anchor in group:
            if anchor == group[-1]:
                item.append(BeautifulSoup(anchor.prettify()))
            else:
                item.append(BeautifulSoup(anchor.prettify() + '<span> | </span>'))
        result.append(item)
    table.replaceWith(result)
def linearize_cols_1_4(soup, table):
    """Replace the table whose id is "linearize-cols-1-4" with a <ul> of
    its links, grouped four per <li> and separated by ' | ' spans.
    Other tables are ignored."""
    if table.get('id') == "linearize-cols-1-4":
        stripped = Tag(soup, "ul")
        pass_no = 0
        # Four passes, each consuming the current first cell of each row.
        while pass_no < 4:
            for row in table.findAll("tr"):
                first_cell = row.find("td")
                row.find("td").replaceWith("")
                stripped.append(first_cell)
            pass_no += 1
        links = stripped.findAll("a")
        chunks = [links[start:start + 4] for start in range(0, len(links), 4)]
        replacement = Tag(soup, "ul")
        for chunk in chunks:
            entry = Tag(soup, "li")
            for link in chunk:
                # The last link in a group gets no trailing separator.
                if link == chunk[-1]:
                    fragment = BeautifulSoup(link.prettify())
                else:
                    fragment = BeautifulSoup(link.prettify() + '<span> | </span>')
                entry.append(fragment)
            replacement.append(entry)
        table.replaceWith(replacement)
def get_first_two(soup, table):
    """Collect the first two columns of each row into a <div> of <li> items.

    Kept cells are renamed to <span> and joined with ' - '; rows with no
    text in those cells are dropped.

    soup:  BeautifulSoup document used as the owner for new tags.
    table: a <table> Tag whose rows are scanned.
    Returns a <div> Tag with one <li> per non-empty row.
    """
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for loop, td in enumerate(tr.findAll("td")):
            if loop == 2:
                break
            try:
                text = ''.join(td.findAll(text=True)).strip()
                # Skip blank cells and literal '&nbsp;' placeholders.
                if text != '' and text != '&nbsp;':
                    td.name = "span"
                    if loop != 1:
                        td.append(' - ')
                    li.append(td)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def get_first_two(soup, table):
    """Collect the first two columns of each row into a <div> of <li> items.

    Kept cells are renamed to <span> and joined with ' - '; rows with no
    text in those cells are dropped.

    soup:  BeautifulSoup document used as the owner for new tags.
    table: a <table> Tag whose rows are scanned.
    Returns a <div> Tag with one <li> per non-empty row.
    """
    enclose = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        for loop, td in enumerate(tr.findAll("td")):
            if loop == 2:
                break
            try:
                text = ''.join(td.findAll(text=True)).strip()
                # Skip blank cells and literal '&nbsp;' placeholders.
                if text != '' and text != '&nbsp;':
                    td.name = "span"
                    if loop != 1:
                        td.append(' - ')
                    li.append(td)
            except Exception:
                # Best effort: malformed cells are skipped silently.
                pass
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
        anchorLink = column.find('a')['href']
        fileName = anchorLink.split('/')[-1].replace('.shtml', '.html')
        # 処理対象のバンドのみ処理する。指定がない場合はすべて。
        if not TARGET_BAND or fileName[:-5] == TARGET_BAND:
            bandBreadCrumbsSoup = BeautifulSoup()
            parentUlTag = createParentUlTag(bandBreadCrumbsSoup)
            bandListTag = column.find('li')
            albumUlTag = Tag(bandBreadCrumbsSoup, 'ul')
            bandListTag.append(albumUlTag)
            for childColumn in column.findAll('ul', attrs = {'class' : 'child-column'}):
                albumUlTag.append(childColumn.find('li'))
            parentUlTag.append(bandListTag)
            # ファイル生成
            resultFile = open('/'.join([PARENT_DIR, 'common/bread_crumbs', fileName]), 'w')
            resultFile.write(parentUlTag.prettify())
            resultFile.close()
            print "write %s" % fileName

            # アルバムファイルのパンくずリスト
            for childColumn in albumUlTag.findAll('li'):
                childAnchorLink = childColumn.find('a')['href']
                albumParentUlTag = copy.deepcopy(parentUlTag)
                albumParentUlTag.append(copy.deepcopy(childColumn))
                # ファイル生成
                splitList = childAnchorLink.split('/')
                childFileName = '/'.join([splitList[-3], splitList[-1]]).replace('.shtml', '.html')
                resultFile = open('/'.join([PARENT_DIR, 'common/bread_crumbs', childFileName]), 'w')
                resultFile.write(albumParentUlTag.prettify())
                resultFile.close()
                print "write %s" % childFileName
# Beispiel #14
# 0
	def parse_html(self):
		title = None
		body = None
		bodysoup = None

		nav_title = None
		meta_title = None
		content_title = None
		
		self.attachments = []

		if (not self.debug_path or self.debug_path == self.path):

			# Remove nbsps
			self.html = self.html.replace("&nbsp;", "")
			
			# Remove attributes from closing tags (!)
			self.html = re.sub(r"</([a-zA-Z]+) [^>]+>", r"</\1>", self.html)
			
			m = RE_BODY.search(self.html)
			if m and m.lastindex == 1:
				bodysoup = BeautifulSoup(m.group(1), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
			else:
				try:
					bodysoup = BeautifulSoup(self.html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
					if not bodysoup:
						fixed_html = self.html.replace("</head>", "</head><body>")
						bodysoup = BeautifulSoup(fixed_html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).html.body
				except AttributeError:
					pass
	
			if not bodysoup:
				raise ImportError("No body")
	
			if self.debug_path == self.path:
				print "\n\n========= DEBUG =========\n"

			# Remove comments
			for comment in (bodysoup.findAll(text=lambda text:isinstance(text, Comment))):
				self.debug("Removed comment: <!-- %s -->" % comment)
				comment.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Convert header divs into h1, h2
			h1_found = False
			for tag in bodysoup.findAll("div"):
				if tag.get("class") == "BL-otsikko1" and not h1_found:
					h1_found = True
					tag.name = "h1"
					self.debug("Converted into H1: %s" % tag)
				elif tag.get("class") == "BL-otsikko2" or \
					(tag.get("class") == "BL-otsikko1" and h1_found):
					tag.name = "h2"
					self.debug("Converted into H2: %s" % tag)
				elif tag.get("class") == "BL-leivanmurut":
					tag.extract()
					self.debug("Removed breadcrumbs")
				else:
					tag.hidden = True
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Remove unwanted elements
			for tag in bodysoup.findAll(["style", "link"]):
				tag.extract()
				self.debug("Removed %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Hide unnecessary elements
			for tag in bodysoup.findAll(["span", "div", "body", "font"]):
				self.debug("Set hidden: %s" % tag.name)
				tag.hidden = True
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Reformat forms
			for form in bodysoup.findAll("form"):
				pass
			
			# Hide non-semantic tables
			for table in bodysoup.findAll("table"):
				if table.get("border") != "1" and len(table.findAll("tr", recursive=False)) < 100 and not has_ancestor(table, "form"):
					table.hidden = True
					for tr in table.findAll("tr", recursive=False):
						tr.hidden = True
						for td in tr.findAll(["td", "th"], recursive=False):
							text = td.find(text=re.compile("[^\s]+", re.U))
							if text and text.strip() != "":
								td.name = "p"
								self.debug("Converted th/td into p: %s" % td)
							else:
								td.hidden = True
					self.debug("Hid non-semantic table")
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Remove orphan th/td/tr
			for el in bodysoup.findAll(["tr", "td", "th"]):
				if el.parent.name != "table" or (el.parent.parent and el.parent.parent.name != "table"):
					if el.name in ("td", "td"):
						text = el.find(text=re.compile("[^\s]+", re.U))
						if text and text.strip() != "":
							el.name = "p"
							self.debug("Converted td/th into p: %s" % el)
						else:
							el.hidden = True
							self.debug("Hid orphan %s: %s" % (el.name, el))
					else:
						el.hidden = True
						self.debug("Hid orphan %s: %s" % (el.name, el))
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Wrap NavigableStrings in td into p
			for tag in bodysoup.findAll(text=lambda text:isinstance(text, NavigableString)):
				if tag.parent.name == "td" and tag.strip() != "":
					p = Tag(bodysoup, "p")
					p.insert(0, "%s" % tag)
					tag.replaceWith(p)
					self.debug("Moved from td into p: %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Convert "loose" NavigableStrings into paragraphs
			for tag in bodysoup.findAll(text=lambda text:isinstance(text, NavigableString)):
				if len(tag.strip()) > 10 and tag.parent.name == "[document]":
					p = Tag(bodysoup, "p")
					p.insert(0, "%s" % tag)
					tag.replaceWith(p)
					self.debug("Moved loose string into p: %s" % tag)
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Move NavigableStrings after list into p before moving lists
			for ul in bodysoup.findAll("ul"):
				if ul.parent.name == "p":
					next = ul.nextSibling
					if isinstance(next, NavigableString):
						p = Tag(bodysoup, "p")
						p.insert(0, "%s" % next)
						next.replaceWith(p)
						self.debug("Moved NavigableString after list into p: %s" % p)
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Move blocks outside paragraphs
			for block in bodysoup.findAll(["p", "ul", "h1", "h2"]):
				parent = block.parent
				if parent.name == "p":
					if block.name in ("h1", "h2"):
						parent.parent.insert(parent.parent.index(parent), block)
						self.debug("Moved %s before p" % block.name)
					else:
						parent.parent.insert(parent.parent.index(parent) + 1, block)
						self.debug("Moved %s after p" % block.name)
			
			# Delete depracated attributes
			for tag in bodysoup.findAll():
				for attr in ("align", "valign", "class", "style", "border", "vspace", "hspace", "cellpadding", "cellspacing"):
					del(tag[attr])
				for attr in ("width", "height"):
					if tag.name != "img":
						del(tag[attr])
				for attr in ("colspan", "rowspan"):
					if not tag.name in ("td", "tr"):
						del(tag[attr])
			bodysoup = BeautifulSoup(bodysoup.prettify())			
				
			# Import images
			for tag in bodysoup.findAll("img"):
				src = tag.get("src")
				if src and src.endswith(".gif") and src.find("/tyhja-") != -1:
					tag.extract()
				elif src and not src.startswith("http://"):
					img_path = os.path.dirname(os.path.join(self.source_dir, self.source_path)) + "/" + src
					if os.path.exists(img_path) and os.path.isfile(img_path):
						img_title = tag.get("title") or tag.get("alt") or ""
						img = Image.create_from_file(img_path, img_title[0:100])
						img.tmp_orig_path = src
						img.save()
						tag["src"] = img.file.url
						self.debug("Imported image: %s" % tag["src"])
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Import external files into Attachment models
			for a in bodysoup.findAll("a", href=re.compile(".+")):
				href = a.get("href")
				path, ext = os.path.splitext(href)
				if not href.startswith("http://") and ext != "" and not ext in (".html", ".shtml", ".php", ".jpg", ".gif", ".png"):
					if href.startswith("/"):
						abspath = self.source_dir + href
					else:
						abspath = self.source_dir + os.path.dirname(self.source_path) + "/" + href
					if os.path.exists(abspath):
						self.debug("Found attachment: %s" % abspath)
						self.attachments.append(abspath) # store for later import as we don't have page id yet
											
			
			# Remove bad linebreaks
			for br in bodysoup.findAll("br"):
				if br.parent.name == "p":
					for sib in (br.previousSibling, br.nextSibling):
						 if not sib or (isinstance(sib, NavigableString) and sib.strip() == ""):
							self.debug("Removed linebreak at (%s, %s)" % (br.parent.index(br), br.parent))
							br.extract()
							break
				elif br.parent.name == "[document]":
					br.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Clean up paragraphs
			for p in bodysoup.findAll("p"):
				non_sentence = lambda str:str != None and not str.strip().endswith(".") and 3 < len(str) < 30
				# Remove empty
				if p.string and p.string.strip() == "":
					self.debug("Removed empty p at (%s, %s)" % (p.parent.name, p.parent.index(p)))
					p.extract()
				# Hide if contains only tag(s)
				elif not p.findAll(text=re.compile(r"[^\s]+", re.U)):
					self.debug("Hid p with no text: %s" % p)
					p.hidden = True
				# Convert short one-liners into h3
				elif non_sentence(p.string) or (len(p.findAll(text=re.compile("[^\s]+", re.U))) == 1 and non_sentence(p.contents[0].string)):
					p.name = "h3"
					self.debug("Converted p into h3: %s" % p)
				# Remove bad styling
				else:
					tags = p.findAll(recursive=False)
					if len(tags) == 1 and tags[0].name in ("b", "u", "i"):
						#if not tags[].previousSibling and not el.nextSibling:
						#	el.hidden = True
						#self.debug("Hid %s from p, only child" % el.name)
						if not p.findAll(text=re.compile("[^\s]+", re.U), recursive=False):
							#print "!!! %s" % p
							tags[0].hidden = True
							self.debug("Hid %s from p, bad styling: %s" % (tags[0].name, p))
			bodysoup = BeautifulSoup(bodysoup.prettify())

			# Remove redundant information
			for text in bodysoup.findAll(text=re.compile(r"^\s*pdf-tiedosto [0-9]+ KB\s+$")):
				self.debug("Removed text: %s" % text)
				text.extract()
			bodysoup = BeautifulSoup(bodysoup.prettify())
				
			# Clean up headings
			for h in bodysoup.findAll(["h1", "h2", "h3", "h4", "h5", "h6"]):
				for el in h.findAll():
					# Remove styling elements (u, b, i, etc)
					if isinstance(el, Tag) and el.name != "a":
						el.hidden = True
						self.debug("Heading clean-up, hid %s in %s" % (el.name, h))
				try:
					# Move h1 at first
					if h.name == "h1" and h.parent.index(h) != 1:
							h.parent.insert(1, h)
							self.debug("Moved %s at first" % h.name)
					# Convert any heading at the beginning of document into h1
					elif h.name != "h1" and h.parent.name == "[document]" and not h.previousSibling:
						self.debug("Converted into h1: %s" % h)
						h.name = "h1"
				except IndexError:
					pass
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Convert internal links
			for a in bodysoup.findAll("a"):
				href = a.get("href")
				if href and not href.startswith("http://") and href.endswith(".shtml"):
					a["href"] = href.replace("/index.shtml", "").replace(".shtml", "").replace("_", "-")
					self.debug("Fixed link: %s -> %s" % (href, a["href"]))
			bodysoup = BeautifulSoup(bodysoup.prettify())
			
			# Parse content_title text
			h1 = bodysoup.find("h1")
			if h1:
				content_title = " ".join(h1.findAll(text=True))
				content_title = re.sub("[\s]+", " ", content_title).strip()
											
			# Reformat
			body = u"" + bodysoup.prettify().decode("UTF8")
			#print "type: %s" % type(body)
			#raise SystemExit()
			body = re.sub(r"\s+>\s+", " &gt; ", body)
			body = re.sub(r"\s+<\s+", " &lt; ", body)
			body = re.sub(r"[\n\r]+", " ", body)
			body = re.sub(r"[ \t]+", " ", body)
			body = re.sub(r">\s+", ">", body)
			body = re.sub(r"\s+<", "<", body)
			body = re.sub(r"</(p|h1|h2|h3|h4|h5|h6|ul|ol|table|tr)>", r"</\1>\n\n", body)
			body = re.sub(r"</(li|td)>", r"</\1>\n", body)
			body = re.sub(r"<(u|b|i|em|strong)>\s*", r" <\1>", body)
			body = re.sub(r"\s*</(u|b|i|em|strong)>", r"</\1> ", body)
			body = re.sub(r"</a>([^\-])", r"</a> \1", body)
			body = re.sub(r"<a ", " <a ", body)
			body = re.sub(r"\s+(\.|,|:|;|!|\?)", r"\1", body)
			
			# Is body valid UTF8?
			try:
				body.encode("UTF8")
			except UnicodeError:
				print "DAA"
				raise SystemExit()
		
		else:
			body = "(debug mode, no content parsed)"


		docsoup = BeautifulSoup(self.html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

		# nav title
		for text in docsoup.findAll(text=re.compile("^#include")):
			m = re.compile('"(.+)"').search(text)
			if m and m.lastindex == 1:
				include_path = m.group(1)
				if include_path.find("valikko") != -1:
					if not include_path.startswith("/"):
						if self.source_path.endswith("/index.shtml"):
							include_path = os.path.join(self.path, include_path)
						else:
							include_path = os.path.join("/".join(self.path.split("/")[0:-1]), include_path)
					f = open(self.source_dir + include_path)
					navsoup = BeautifulSoup(f.read(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
					f.close()
					for a in navsoup.findAll("a", href="/"+self.source_path):
						if a.get("class") == "valikon_tekstit" or a.parent.get("class") == "avattu_alavalikko":
							nav_title = a.find(text=re.compile("[^\s]+")).strip()
							break

		# meta title
		try:
			meta_title = docsoup.head.title.string
		except AttributeError:
			pass
		if meta_title:
			valid_meta_title_parts = []
			for part in [part.strip() for part in meta_title.split(" - ")]:
				if part not in ("BirdLife Suomi", u"Yhdessä lintujen puolesta"):
					valid_meta_title_parts.append(part)
			meta_title = u" – ".join(valid_meta_title_parts)

		# choose best title
		self.debug("Titles: nav: '%s', meta: '%s', content: '%s'" % (nav_title, meta_title, content_title))
		if nav_title:
			title = nav_title
			self.debug("Title choice: nav_title: %s" % title)
		elif content_title and meta_title and len(content_title) < len(meta_title):
			title = content_title
			self.debug("Title choice: content_title (shorter): %s" % title)
		elif meta_title:
			title = meta_title
			self.debug("Title choice: meta_title: %s" % title)
		elif content_title:
			title = content_title
			self.debug("Title choice: content_title: %s" % title)
		else:
			title = "%s (autogen)" % self.slug.capitalize()
			self.debug("Title choice: autogenerated from slug: %s" % title)

		if not title:
			raise ImportError("No title")
			
		if self.level == 0:
			if self.slug in ("liity", "suojelu", "lintuharrastus", "julkaisut", "yhdistys"):
				self.template = "osio.html"
			#else:
			#	self.level = 0
		if self.level == 0 and self.slug == "":
			self.template = "etusivu.html"
		
		self.title = title
		self.body = body
# Beispiel #15
# 0
 def CreateBody(self):
     '''Build a randomized <body>: a nested div skeleton whose leaf divs are
     filled with placeholder blocks ([header]/[main]/[links]/[footer]),
     menus, sidebars, forms and miscellaneous content.  The finished body
     is appended to self.soup.html.  Uses random throughout, so each call
     produces a different layout.
     '''
     body = Tag(self.soup, 'body')
     # Overall tag budget for the generated page.
     totalTagsCount = random.randint(150, 400)
     '''Создаем структуру шаблона из тегов div'''
     # Build the template structure out of div tags: 1-3 top-level divs,
     # then repeatedly extend random leaf divs until the div budget
     # (15-25% of the tag budget) is spent.
     for _ in range(random.randint(1, 3)):
         body.append(self.CreateDiv())
     divsTotalCount = totalTagsCount * random.randint(15, 25) / 100
     while divsTotalCount > 0:
         # Leaf divs = divs with no child tags at all.
         divsLowLevelList = [
             item for item in body.findAll('div')
             if len(item.findAll(True)) == 0
         ]
         divToExtend = random.choice(divsLowLevelList)
         for _ in range(random.randint(2, 4)):
             divToExtend.append(self.CreateDiv())
             divsTotalCount -= 1
     '''Получаем список тегов div разных уровней'''
     # Partition divs by nesting level: top (direct children of body),
     # low (no children), mid (everything in between).
     divsList = body.findAll('div')
     divsTopLevelList = [
         item for item in body.findAll('div', recursive=False)
     ]
     divsLowLevelList = [
         item for item in divsList if len(item.findAll(True)) == 0
     ]
     divsMidLevelList = [
         item for item in divsList
         if item not in divsTopLevelList and item not in divsLowLevelList
     ]
     '''Проставляем им атрибуты'''
     # Assign id/class attributes with level-specific probabilities.
     for item in divsTopLevelList:
         self.AppendIds(item, 95, 1)
     for item in divsMidLevelList:
         self.AppendIds(item, 20, 75)
     for item in divsLowLevelList:
         self.AppendIds(item, 30, 65)
     '''Создаем наполнение главных блоков'''
     # Reserve leaf divs for the main placeholder blocks; negative pop
     # indices pick from the end of the list.
     divHeader = divsLowLevelList.pop(random.randint(0, 2))
     divHeader.string = '[header]'
     divMain = divsLowLevelList.pop(random.randint(1, 3))
     divMain.string = '[main]'
     divLinks = divsLowLevelList.pop(random.randint(-3, -1))
     divLinks.string = '[links]'
     divFooter = divsLowLevelList.pop(random.randint(-3, -1))
     divFooter.string = '[footer]'
     '''Создаем меню, сайдбары и формы'''
     # Menus, sidebars and forms consume further leaf divs.
     for _ in range(random.randint(1, 2)):
         menu = divsLowLevelList.pop()
         menu.append(self.CreateList(0))
     for _ in range(random.randint(1, 2)):
         sidebar = divsLowLevelList.pop()
         self.CreateSidebar(sidebar)
     for _ in range(random.randint(0, 2)):
         form = divsLowLevelList.pop()
         form.append(self.CreateForm())
     '''Создаем прочее наполнение'''
     # Remaining leaf divs get miscellaneous content in random order.
     random.shuffle(divsLowLevelList)
     for _ in range(random.randint(2, 5)):
         div = divsLowLevelList.pop()
         self.CreateOthers(div)
     self.soup.html.append(body)
# Beispiel #16
# 0
            bandListTag = column.find('li')
            albumUlTag = Tag(bandBreadCrumbsSoup, 'ul')
            bandListTag.append(albumUlTag)
            for childColumn in column.findAll('ul',
                                              attrs={'class': 'child-column'}):
                albumUlTag.append(childColumn.find('li'))
            parentUlTag.append(bandListTag)
            # ファイル生成
            resultFile = open(
                '/'.join([PARENT_DIR, 'common/bread_crumbs', fileName]), 'w')
            resultFile.write(parentUlTag.prettify())
            resultFile.close()
            print "write %s" % fileName

            # アルバムファイルのパンくずリスト
            for childColumn in albumUlTag.findAll('li'):
                childAnchorLink = childColumn.find('a')['href']
                albumParentUlTag = copy.deepcopy(parentUlTag)
                albumParentUlTag.append(copy.deepcopy(childColumn))
                # ファイル生成
                splitList = childAnchorLink.split('/')
                childFileName = '/'.join([splitList[-3], splitList[-1]
                                          ]).replace('.shtml', '.html')
                resultFile = open(
                    '/'.join(
                        [PARENT_DIR, 'common/bread_crumbs', childFileName]),
                    'w')
                resultFile.write(albumParentUlTag.prettify())
                resultFile.close()
                print "write %s" % childFileName