def anchortest(containerhtml, totalwords, whichcontainer):
    percentage = 65
    awords = 0
    soup = BeautifulSoup(containerhtml)
    allATags = soup.findAll('a')
    for atag in allATags:
        asoup = BeautifulSoup(atag.renderContents())
        atext = ' '.join([e for e in asoup.recursiveChildGenerator() if isinstance(e, unicode)])
        awords = awords + len(atext.split())

    alloptionTags = soup.findAll('option')
    for aoption in alloptionTags:
        osoup = BeautifulSoup(aoption.renderContents())
        otext = ' '.join([e for e in osoup.recursiveChildGenerator() if isinstance(e, unicode)])
        awords = awords + len(otext.split())

    mlogger.debug("how many totalwords =%d  awords =%d" % (totalwords, awords))
    if awords == 0:
        awords = 1
    howmuch = (float(awords) / float(totalwords)) * 100
    mlogger.debug("what is the value of howmuch= %d" % howmuch)

    if howmuch >= percentage:
        # don't consider as summary text
        return False
    else:
        # consider as summary text
        return True
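A minimal usage sketch for anchortest (assumptions: BeautifulSoup 3 on Python 2, and the mlogger below is a stand-in for the module's own logger). A link-heavy container fails the 65% anchor-word test; ordinary prose passes:

import logging
from BeautifulSoup import BeautifulSoup

logging.basicConfig()
mlogger = logging.getLogger('demo')    # stand-in for the module's logger

linky = '<p><a href="#">read</a> <a href="#">more</a> <a href="#">here</a></p>'
print anchortest(linky, 3, "P")    # False: every word sits inside an <a> tag
prose = '<p>plain sentence with one <a href="#">link</a> inside</p>'
print anchortest(prose, 6, "P")    # True: anchor words are a small minority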
Example #2
    def add_newsitem(self, entry):
        """ Add news item
        """
        title = entry.get('title', '')
        title = title.replace('&nbsp;', ' ').strip()

        description = BeautifulSoup(entry.get('summary', ''))
        description = ''.join([e for e in description.recursiveChildGenerator()
                               if isinstance(e,unicode)]).strip()

        # Descopera.ro
        index = description.find('Citeste tot articolul')
        if index != -1:
            description = description[:index]

        if not (title and description):
            return None

        url = entry.get('link', '#').strip()
        # Skip existing news
        uid = INameChooser(self.context).chooseName(title, None)
        try:
            newsitem = self.context[uid]
        except Exception, err:
            pass
def unhtml_readable(html):

	if isinstance(html, basestring):
		soup = BeautifulSoup(html)
	else:
		soup = html

	result = []

	for node in soup.recursiveChildGenerator():
		if isinstance(node, Tag):
			if node.name in ('br', 'tr'):
				result.append('\n')

		if isinstance(node, Declaration):
			# Declaration subclasses NavigableString, so skip it here
			# before the NavigableString branch below catches it
			pass

		elif isinstance(node, NavigableString):
			if node.parent.name not in EXCLUDE_CHILD_OF and not isinstance(node, EXCLUDE_INSTANCES) and not _is_hidden(node.parent):
				text = resolve_char_entities(unicode(node).strip())
				if text:
					# append a space if the previous chunk is not whitespace
					if result and result[-1] not in (' ', '\n'):
						result.append(' ')

					result.append(text)

					if node.parent.name == 'a':
						if node.parent.has_key('href') and node.parent['href'] != text:
							result.append(' [%s]' % node.parent['href'])

	return u''.join(result)
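unhtml_readable leans on several module-level names that the snippet does not define; the stubs below are plausible but hypothetical, just enough to make the function importable and runnable:

from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Declaration, Comment

EXCLUDE_CHILD_OF = ('script', 'style', 'option', 'title')    # hypothetical
EXCLUDE_INSTANCES = (Comment,)                               # hypothetical

def _is_hidden(tag):
    # hypothetical stub: treat inline display:none as hidden
    return 'display:none' in (tag.get('style') or '').replace(' ', '')

def resolve_char_entities(s):
    # hypothetical stub: return the text unchanged
    return s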
def urlify(value):
    soup = BeautifulSoup(value)

    def urlify(s):
        s = escape(s)
        s = re.sub(PROTOCOL_PATTERN, r'<a href="\1">\1</a>', s)
        s = re.sub(WOPROTOCOL_PATTERN, r'<a href="http://\1">\1</a>', s)
        return BeautifulSoup(s)

    def has_parents(node, tags):
        if node is None:
            return False
        return node.name in tags or has_parents(node.parent, tags)

    text_chunks = (c for c in soup.recursiveChildGenerator() if isinstance(c, unicode))
    for chunk in text_chunks:
        s = chunk
        if not has_parents(chunk.parent, ['a', 'code']):
            s = urlify(s)
        chunk.replaceWith(s)

    for link in soup.findAll('a'):
        # Tag.__contains__ tests a tag's contents, so check the attribute
        # with has_key() rather than `'rel' in link`
        if link.has_key('rel'):
            link['rel'] += ' '
        else:
            link['rel'] = ''
        link['rel'] += 'nofollow'

    return unicode(soup)
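PROTOCOL_PATTERN and WOPROTOCOL_PATTERN are not part of the snippet; one plausible (hypothetical) pair, with escape taken from the stdlib cgi module, is enough to exercise the function:

import re
from cgi import escape
from BeautifulSoup import BeautifulSoup

PROTOCOL_PATTERN = re.compile(r'(https?://[^\s<]+)')     # hypothetical
WOPROTOCOL_PATTERN = re.compile(r'\b(www\.[^\s<]+)')     # hypothetical

print urlify(u'docs at http://example.com and <code>http://left.alone</code>')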
Example #6
def text_processor(htmlstrings, idealmodel, debug):
    result = {}
    structure = []
    fulltext = []

    if htmlstrings:
        lineID = 0
        for line in htmlstrings:
            lenstr = len(line)
            words = len(line.split())
            comas = len(line.split(","))
            dots = len(line.split("."))
            equal = len(line.split("="))
            soup = BeautifulSoup(line)
            if words:
                htmltags = []
                for child in soup.recursiveChildGenerator():
                    name = getattr(child, "name", None)
                    if name is not None:
                        htmltags.append(name)
                    elif not child.isspace():  # leaf text node; nothing to record
                        pass
                code = 'W' + str(words) + ',C' + str(comas) + ',D' + str(
                    dots) + ',E' + str(equal)
                result[code] = lineID

                if idealmodel:
                    if idealmodel.has_key(code):
                        # ignore
                        fulltext.append('')
                    else:
                        keep = 0
                        if words:
                            keep = 1
                        if 'a' in htmltags:
                            keep = 0

                        if keep:
                            fulltext.append(
                                str(lineID) + ' ' + str(htmltags) + ' ' +
                                code + '\n' + line)
                        else:
                            fulltext.append('')

                # Save structure line by line
                stats = {}
                stats['lineID'] = lineID
                stats['code'] = code
                stats['words'] = words
                stats['comas'] = comas
                stats['dots'] = dots
                stats['equal'] = equal
                structure.append(stats)

                #print code
                if debug:
                    print 'STATS' + str(htmltags) + ' ' + '[' + code + '] '
                    print 'TXT' + ' ' + line
            lineID = lineID + 1
    return (result, structure, fulltext)
Example #7
def get_level_1_elements(html_lines):
  l1s = []
  html_str = ''.join(html_lines)
  soup = BeautifulSoup(html_str)
  for child in soup.recursiveChildGenerator():
    # keep only tags that sit directly under the soup root
    if hasattr(child, 'parent') and child.parent == soup:
      try:
        name = getattr(child, "name", None)
        if name is not None:
          print "_dbg name: %s" % name
          l1s.append(child)
      except:
        pass

  print "_dbg num l1s: %d" % len(l1s)
  return l1s
def purgeAttributes(self, mime, _old):
    html = mime.html()
    soup = BeautifulSoup(html)
    newMime = QMimeData()
    for tag in soup.recursiveChildGenerator():
        # strip the style properties listed in REMOVE_ATTRIBUTES
        try:
            for index, (key, value) in enumerate(tag.attrs):
                if key != 'style':
                    continue
                new = value.split(';')
                new = ';'.join([s for s in new
                    if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
                tag.attrs[index] = (u'style', new)
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass

    # assign the modified html to new Mime
    newMime.setHtml(str(soup).decode('utf8'))

    # default _processHtml method
    return _old(self, newMime)
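purgeAttributes filters style declarations against a REMOVE_ATTRIBUTES list that is not shown; a plausible (hypothetical) definition would name the CSS properties to drop from pasted rich text:

# hypothetical: CSS properties to strip from pasted HTML
REMOVE_ATTRIBUTES = ['font-family', 'font-size', 'line-height', 'background-color', 'color']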
Example #9
def getPrettyText(soup):
    if soup is None:
        return ''
    text_html = str(soup)
    # prettify to break lines like HTML
    text_html = re.sub(r'</?p.*?>', '<br/>', text_html)
    text_html = re.sub(r'<li.*?>', '<br/>* ', text_html)    # text bullet
    text_html = re.sub(r'</li.*?>', '', text_html)
    text_html = text_html.replace("\n", "<br/>")
    text_soup = BeautifulSoup(text_html)
    text_line = ''
    text_lines = []
    pad_chars_re = re.compile(r'\s')
    for elem in text_soup.recursiveChildGenerator():
        if isinstance(elem, types.StringTypes) and not isinstance(elem, (Comment, ProcessingInstruction)):
            first_pad = ' ' if pad_chars_re.match(elem.string[:1]) else ''
            last_pad = ' ' if pad_chars_re.match(elem.string[-1:]) else ''
            clean_text = cleanWhitespace(elem.string)
            text_line += first_pad + clean_text + last_pad
        elif hasattr(elem, 'name') and elem.name == 'br':
            text_line = cleanWhitespace(text_line.strip())
            if len(text_line) > 0:
                text_lines.append(text_line)
            text_line = ''
    text_line = cleanWhitespace(text_line.strip())
    if len(text_line) > 0:
        text_lines.append(text_line)
    pretty_text = '\n'.join(text_lines)
    return pretty_text
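getPrettyText relies on a cleanWhitespace helper that is not shown; a plausible, explicitly hypothetical stand-in:

import re

def cleanWhitespace(s):
    # hypothetical: collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', s)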
Example #10
def htmlmatch(page, pattern):
    """Finds all the occurrencies of the pattern tree into the given html page"""
    isoup = BeautifulSoup(page)
    psoup = BeautifulSoup(pattern)

    def untiltag(gen):
        node = gen.next()
        while True:
            if isinstance(node, Tag):
                break
            elif len(node.lstrip()) == 0:
                node = gen.next()
            else:
                break
        return node

    pgen = psoup.recursiveChildGenerator()
    pnode = untiltag(pgen)
    igen = isoup.recursiveChildGenerator()
    inode = untiltag(igen)

    variables = []
    lastvars = {}

    while True:
        newvars = nodematch(inode, pnode)
        if newvars is not None:
            if len(newvars) > 0:
                lastvars.update(newvars)
            try:
                pnode = untiltag(pgen)
            except StopIteration:
                pgen = psoup.recursiveChildGenerator()
                pnode = untiltag(pgen)
                if len(lastvars) > 0:
                    variables.append(lastvars)
                    lastvars = {}
        else:
            pgen = psoup.recursiveChildGenerator()
            pnode = untiltag(pgen)
        try:
            inode = untiltag(igen)
        except StopIteration:
            return variables
Example #12
def text_with_newlines(some_string):
    elem = BeautifulSoup(some_string)
    text = ''
    for e in elem.recursiveChildGenerator():
        if isinstance(e, basestring):
            text += e.strip()
        elif e.name == 'br':
            text += '\n'
    return text
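A quick check of text_with_newlines (BeautifulSoup 3 on Python 2 assumed); note that strip() also eats spaces at chunk boundaries:

from BeautifulSoup import BeautifulSoup

print text_with_newlines(u'one<br/>two <b>bold</b>')
# -> u'one\ntwobold'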
def theparse(url):
	gotags = []
	webpage = urllib2.urlopen(url)
	code = webpage.read()
	codesoup = BSS(code)
	for child in codesoup.recursiveChildGenerator():
		name = getattr(child, "name", None)
		if name is not None:
			gotags.append(name)
	return gotags
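theparse assumes BSS is an alias for the BeautifulSoup class, presumably created along these lines (an assumption; the import is not part of the snippet):

import urllib2
from BeautifulSoup import BeautifulSoup as BSS    # assumed alias

# e.g. theparse('http://example.com') would return the tag names in
# document order, something like [u'html', u'head', u'title', u'body', ...]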
Example #16
    def add_newsitem(self, entry):
        """ Add news item
        """
        title = entry.get('title', '')
        title = title.replace('&nbsp;', ' ').strip()

        description = BeautifulSoup(entry.get('summary', ''))
        description = ''.join([e for e in description.recursiveChildGenerator()
                        if isinstance(e, unicode)]).strip()

        ptool = getToolByName(self.context, 'portal_properties')
        sanitize = getattr(ptool, 'sanitize', None)
        if sanitize:
            title_sanitize = sanitize.getProperty('subject', [])
            for expr in title_sanitize:
                title = title.replace(expr, '')
            desc_sanitize = sanitize.getProperty('body', [])
            for expr in desc_sanitize:
                description = description.replace(expr, '')

        body = description

        utils = getUtility(IText)
        description = utils.truncate(description, 20, 200)

        if not (title and description):
            return None

        url = entry.get('link', '#').strip()

        updated = entry.get('updated', None)
        if not updated:
            updated = datetime.now(bucharest)
        else:
            try:
                updated = parseDatetimetz(updated)
            except SyntaxError:
                updated = parseDatetimetz(updated.replace(' ', 'T', 1))
            except:
                updated = datetime.now(bucharest)

            # Skip news older than 10 days
            plone_ro = 'plone.ro' in url
            if not plone_ro:
                try:
                    if updated < (datetime.now() - timedelta(10)):
                        return None
                except TypeError:
                    if updated < (datetime.now(bucharest) - timedelta(10)):
                        return None
                except Exception, err:
                    logger.exception(err)
Example #17
def format_html_string(html_string):
    """Parse a html fragment to plain text"""
    soup = BeautifulSoup(html_string)
    clean_string = ''

    for e in soup.recursiveChildGenerator():
        if isinstance(e, unicode):
            clean_string += e.replace('\r', '')
        elif isinstance(e, Tag):
            if e.name in ['p', 'br', 'div']:
                clean_string += '\n'
        else:
            pass
    lines = clean_string.split("\n")
    clean_string = "\n\n".join([line.strip() for line in lines if line.strip()])
    return clean_string
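A small usage sketch (assuming BeautifulSoup 3, with Tag imported alongside BeautifulSoup as the function requires):

from BeautifulSoup import BeautifulSoup, Tag

print format_html_string(u'<div>Hello<br/>world</div><p>bye</p>')
# -> Hello
#
#    world
#
#    bye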
    def fetchTitle(self, url):
        mlogger.debug("fetching title")
        title = None
        try:
            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            titleTag = soup.findAll('title')
            titlesoup = BeautifulSoup(titleTag[0].renderContents(), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            title = ''.join([e for e in titlesoup.recursiveChildGenerator() if isinstance(e, unicode)])
            mlogger.debug("what is the title =" + title)
            if len(title) > 0:
                return title
            else:
                return None
        except:
            mlogger.debug(str(sys.exc_info()[0]))
            return None
Example #19
    def html(self):
        '''
        Returns the article's HTML text, produced by running the content
        through the configured filter.
        '''
        if self.filter in filters:
            result = filters[self.filter](self.text)
        else:
            result = linebreaks(escape(self.text))

        soup = BeautifulSoup(result)

        def urlify(s):
            s = re.sub(WWW_PATTERN, r'\1http://www.', s)
            s = re.sub(FTP_PATTERN, r'\1ftp://ftp.', s)
            s = re.sub(PROTOCOL_PATTERN, r'<a href="\1\2">\1\2</a>\3\4', s)
            return BeautifulSoup(s)

        def has_parents(node, tags):
            if node is None:
                return False
            return node.name in tags or has_parents(node.parent, tags)

        text_chunks = [
            c for c in soup.recursiveChildGenerator()
            if isinstance(c, unicode)
        ]
        for chunk in text_chunks:
            s = chunk
            if not has_parents(chunk.parent, ['code']):
                s = re.sub(ur'\B--\B', u'—', s)
            if not has_parents(chunk.parent, ['a', 'code']):
                s = urlify(s)
            chunk.replaceWith(s)

        for link in soup.findAll('a'):
            # Tag.__contains__ tests a tag's contents, so check the attribute
            # with has_key() rather than `'rel' in link`
            if link.has_key('rel'):
                link['rel'] += ' '
            else:
                link['rel'] = ''
            link['rel'] += 'nofollow'
        result = unicode(soup)
        return mark_safe(result)
Example #20
    def on_command(self, message, command, input):
        headers = {
            'Accept': 'application/xml',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'utf-8',
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
        }
        try:
            data = requests.get('http://www.google.com/ig/calculator', headers=headers, params={'hl': 'en', 'q': input})
            result = re.search(r'rhs: \"(.*?)\"', data.content).group(1)
            html = BeautifulSoup(unicode(result), convertEntities=BeautifulSoup.HTML_ENTITIES)
            for el in html.recursiveChildGenerator():
                if not isinstance(el, unicode) and el.name == 'sup':
                    el.replaceWith('^' + el.text)
            print html
            message.reply(html)
        except Exception, e:
            print e
            message.reply('Could not compute.')
Example #21
def get_items():
    count = 0
    outfile = open(qiushi, "w")
    for i in range(1, 2):
        url = "rest://qiushibaike.com/hot/page/%d" % i
        data = urllib2.urlopen(url).readlines()
        soup = BeautifulSoup("".join(data))
        contents = soup.findAll("div", "content")
        stories = [str(text) for text in contents]
        for story in stories:
            count += 1
            logging.info("processing page %d, %d items added", i, count)
            minisoup = BeautifulSoup(story)
            text = "".join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
            text = urllib.unquote(unescape(text, {"&quot;": '"'}))
            text = formalize(text).encode("utf-8")
            #            print >> outfile, '-' * 20 + " %05d " % count + '-' * 20 + "\n"
            print >> outfile, text
    outfile.close()
Example #23
    def get_text(self):
        soup = BeautifulSoup(self.page)
        return ''.join(e.strip() for e in soup.recursiveChildGenerator() if isinstance(e, unicode))
Example #24
def recursiveStrip(tweet):
    soup = BeautifulSoup(unicode(tweet))
    return ''.join([e for e in soup.recursiveChildGenerator() if isinstance(e, unicode)])
Example #25
def remove_tag(origin):
    minisoup = BeautifulSoup(origin)
    text = "".join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
    text = urllib.unquote(unescape(text, {"&quot;": '"'}))
    text = formalize_better(formalize(text)).encode("utf-8")
    return text
Example #26
def defrontpagify(argv=None):

    # ########## ARGUMENTS ########## #
    #settings, args = process_command_line(argv)
    #return settings, args        # success

    # ########## VARIABLES ########## #
    # Massage input file contents
    myMassage = [(re.compile('<!-([^-])'),
                  lambda match: '<!--' + match.group(1))]
    myNewMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    myNewMassage.extend(myMassage)

    # delete tags and all contents
    OBLITERATE_TAGS = ['meta', 'script', 'style']

    # delete tags, preserve contents
    REMOVE_TAGS = ['font', 'xxpstylexx', 'i', 'b', 'em', 'strong']
    REMOVE_TAG_WITH_ATTRIBUTE = ['language=javascript']

    # image tags to remove
    REMOVE_IMG_WITH_ATTRIBUTE = ['anarule', 'anabul']

    # replace tags
    REPLACE_TAG_WITH_DIV = ['u1']
    REPLACE_TAG_WITH_SPAN = ['st1']

    # strip attributes, preserve tag
    REMOVE_ATTRIBUTES = [
        'id', 'lang', 'language', 'onmouseover', 'onmouseout', 'script',
        'style', 'font', 'dir', 'face', 'size', 'color', 'class',
        'width', 'height', 'hspace', 'border', 'valign', 'align', 'background',
        'bgcolor', 'text', 'link', 'vlink', 'alink', 'cellpadding',
        'cellspacing', 'colspan', 'rowspan', 'bordercolordark',
        'bordercolorlight'
    ]

    # replace attributes
    #REPLACE_ATTRIBUTE = []

    # modify attribute values
    #MODIFY_ATTRIBUTE = []

    # remove &markup
    REMOVE_MARKUP = ['&nbsp']

    # ########## FUNCTIONS ########## #
    # get list of all *.htm* files in specified dir
    # loop through each, create file object, remove all MS Front Page and other HTML cruft
    # write result to new file object with '.pretty.html' appended to end of filename
    #for infile in glob.glob( os.path.join('.', '*.htm*') ):
    for infile in glob.glob('*.htm*'):
        with open(infile, 'r') as raw:

            # strip line feeds with line.strip() and markupMassage, write new files to ./souped
            outputlist = [line.strip() for line in raw]
            old = "".join(outputlist)

            # strip REMOVE_MARKUP from old using string method replace
            for x in REMOVE_MARKUP:
                old = old.replace(x, "")

            new = open(os.path.join('./souped', infile), 'w')
            soup = BeautifulSoup(old, markupMassage=myNewMassage)

            # remove all comments
            comments = soup.findAll(
                text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]

            # remove all tags and enclosed content in OBLITERATE_TAGS
            for tag in OBLITERATE_TAGS:
                tags = soup.findAll(tag)
                [oblit.extract() for oblit in tags]

            # remove all img tags with 'anarule.gif', 'anabul.gif', 'nabul.gif',
            # 'narule.gif', 'logo', or _derived in the src attribute
            imgs = soup.findAll(
                'img', src=re.compile('.*(ana|narule|nabul|logo|_derived).*'))
            [img.extract() for img in imgs]

            # remove all tags in REMOVE_TAGS, but preserve their contents.
            for tag in REMOVE_TAGS:
                for match in soup.findAll(tag):
                    match.replaceWithChildren()

            # remove all attributes in REMOVE_ATTRIBUTES from all tags,
            # but preserve the tag and its content.
            #for attribute in REMOVE_ATTRIBUTES:
            #    for tag in soup.findAll():  ## error: `attribute` reference fails
            #        del(tag[attribute])
            for tag in soup.recursiveChildGenerator():
                try:
                    tag.attrs = [(key, value) for key, value in tag.attrs
                                 if key not in REMOVE_ATTRIBUTES]
                except AttributeError:
                    # 'NavigableString' object has no attribute 'attrs'
                    pass

            # remove all carriage returns from within the contents of an anchor tag
            # edit:  this doesn't work, use dos2unix from debian repo, plus file
            # preprocessing and soup massaging at beginning of this function instead
            #anchors = soup.findAll(text=re.compile('.*\^M.*'))
            #[anchor.find(text=re.compile('.*^M.*')).extract('^M') for anchor in anchors]
            #anchors = soup.findAll("a", text="\015")
            #[anchor.find(text="\015").replaceWith(" ") for anchor in anchors]

            # remove all tables with no contents
            #tags = soup.findAll('td',text=None)
            #[tag.extract() for tag in tags]
            #tags = soup.findAll('tr',text=None)
            #[tag.extract() for tag in tags]
            #tags = soup.findAll('table',text=None)
            #[tag.extract() for tag in tags]
            for table in soup.findAll("table", text=None, recursive=True):
                if not table.text:
                    table.extract()

            # done.  write and close.
            new.write(soup.prettify())
            new.close()
    def _remove_tag(self):
        soup = BeautifulSoup(self.data)
        self.outtext = ''.join([
            element for element in soup.recursiveChildGenerator()
            if isinstance(element, unicode)
        ])
def fetchSummaryText(url):    
    mlogger.debug("fetching summary text")
    
    title = None
    try:
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page,convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        #find all p tags
        allPTags =  soup.findAll('p')
        titleTag = soup.findAll('title')
        titlesoup = BeautifulSoup(titleTag[0].renderContents(),convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        title = ''.join([e for e in titlesoup.recursiveChildGenerator() if isinstance(e,unicode)])
        mlogger.debug("what is the title ="+title)
        
        for aP in allPTags :
           psoup =  BeautifulSoup(aP.renderContents(),convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
           ptext = ''.join([e for e in psoup.recursiveChildGenerator() if isinstance(e,unicode)])
           #mlogger.debug("render contents="+aP.renderContents())
           #mlogger.debug("ptext ="+ptext)
                     
           if properContainer(ptext) :
                totalwords = len(ptext.split())
                result = anchortest(aP.renderContents(), totalwords, "P")
                mlogger.debug("what is the value of result="+str(result))            
                if result :
                    mlogger.debug("summarized text= %s" % ptext)
                    #if greater than 48 words pass only 48 tokens
                    if len(ptext.split())  >= 48 :
                        nsumtext =  ' '.join(ptext.split()[0:47])
                        nsumtext = nsumtext + " ...."
                        return (nsumtext,title)
                    else :
                        return (ptext,title)
                else :
                    continue
                
        #now search for div tags
        allDivTags =  soup.findAll('div')
        for aDiv in allDivTags :
            dsoup = BeautifulSoup(aDiv.renderContents(),convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            dtext = ''.join([e for e in dsoup.recursiveChildGenerator() if isinstance(e,unicode)])
            
            if properContainer(dtext) :
                totalwords = len(dtext.split())
                result = anchortest(aDiv.renderContents(), totalwords, "DIV")
                mlogger.debug("what is the value of result="+str(result))            
                if result :
                    mlogger.debug("summarized text= %s" % dtext)
                    #if greater than 48 words pass only 48 tokens
                    if len(dtext.split())  >= 48 :
                        nsumtext = ' '.join(dtext.split()[0:47])
                        nsumtext = nsumtext + " ...."
                        return (nsumtext,title)
                    else :
                        return (dtext,title)
                else :
                    continue
                
                
        return (None,None)
        
    except:             
        mlogger.debug(str(sys.exc_info()[0]))
        return (None,title)
Example #30
def buildpattern(html, debug):
    doc = {}
    docwords = {}
    structure = []
    fulltext = []
    title = ''
    attributes = {}
    x = []
    y = []
    
    # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"]
    html = re.sub(r'<\/script>', "</script>\n", html)
    html = re.sub(r'<meta ', "\n<meta ", html)
    html = re.sub(r'<\/title>', "</title>\n", html)
    html = re.sub(r'<\/div>', "</div>\n", html)
    html = re.sub(r'<\/p>', "</p>\n", html)
    html = re.sub(r'<\/li>', "</li>\n", html)
    html = re.sub(r'<\/style>', "</style>\n", html)
    html = re.sub(r'<\/dd>', "</dd>\n", html)

    htmlstrings = html.splitlines()

    if htmlstrings:
        lineID = 0
        for line in htmlstrings:
            lenstr = len(line)
            words = len(line.split())
            comas = len(line.split(","))
            dots = len(line.split("."))
            equal = len(line.split("="))
            soup = BeautifulSoup(line)
            if words:
                htmltags = []
                visiblecontent = soup.getText()
                for child in soup.recursiveChildGenerator():
                    name = getattr(child, "name", None)
                    if name is not None:
                         htmltags.append(name)
                    elif not child.isspace():  # leaf text node; nothing to record
                        pass
                matrix = {}
                visiblewords = len(visiblecontent.split())
                matrix['words'] = str(words)
                matrix['visiblewords'] = 0
                matrix['comas'] = comas
                matrix['dots'] = dots
                matrix['equal'] = equal
                matrix['html'] = line
                matrix['tags'] = str(visiblecontent)
                code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal)
                matrix['code'] = code
                if visiblewords > 10:
                    matrix['visiblewords'] = str(visiblewords)
                doc[lineID] = matrix
            lineID = lineID + 1    
        
    if debug:
        # walk the lines in numeric order
        for lineID, item in sorted(doc.items()):
            if lineID:
                code = item['code']
                line = str(item['html'])
                words = item['visiblewords']
                tags = item['tags']
                x.append(lineID)
                y.append(int(words))
                #print 'W' + str(words) + ' ' + line + ' ' + code
                if words:
                    print str(lineID) + ',' + code + ',' + line + '\t' + tags
    
    return (x,y,doc)
Example #31
    def process(self, items):
        text = "\n".join(self.publish(item, level) for (item, level) in items)

        soup = BeautifulSoup(text)
        normalizer = getUtility(IURLNormalizer).normalize

        stack = [{'children': [], 'level': 0}]

        headings = soup.findAll(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

        for index, heading in enumerate(headings):
            level = int(heading.name[1])

            hid = 'section-' + normalizer(heading.string) + '-%d' % (index + 1)

            title = u''
            for string in heading.recursiveChildGenerator():
                if isinstance(string, unicode):
                    title += string.lstrip('123456789. ').strip()

            # Remove trivial headings
            if not title:
                heading.extract()
                continue

            entry = {
                'title': title,
                'id': hid,
                'children': [],
                'level': level,
            }

            i = 0
            while level <= stack[-1]['level']:
                stack.pop()
                i += 1

            stack[-1]['children'].append(entry)
            stack.append(entry)

            heading['id'] = hid

            if level == 1:
                heading.name = 'h2'
                heading['class'] = 'documentFirstHeading'

        # Make sure we start with a heading (default to 'own').
        for child in soup.recursiveChildGenerator():
            if isinstance(child, unicode):
                if child.strip('\n '):
                    hid = 'section-0'
                    title = self.context.Title().decode('utf-8')
                    soup.insert(0, '<h2 id="%s">%s</h2>' % (hid, title))
                    # stack[0]['children'].insert(
                    #    0, {'title': title,
                    #        'id': hid,
                    #        'children': [],
                    #        'level': 2,
                    #        })
                    break
            elif child.name.startswith('h'):
                break

        while len(stack[0]['children']) == 1:
            stack[0] = stack[0]['children'].pop()

        return soup, stack[0]['children']
Example #33
from depot.models import Resource, Location

import logging
logger = logging.getLogger('aliss')

# probably move this code to utils.py if enough
def get_url_content(url):
    """takes a url and returns the text content of the page"""
    try:
        response = urllib2.urlopen(url)
        htmltext = response.read()
    except urllib2.HTTPError, e:
        raise Exception(BaseHTTPRequestHandler.responses[e.code])
    
    soup = BeautifulSoup(htmltext)
    result = ''.join([e for e in soup.recursiveChildGenerator() if isinstance(e,unicode)])
    return result.encode('ascii', 'ignore')

from placemaker import placemaker

class geomaker(object):
    """docstring for geomaker"""
    def __init__(self, content):
        self.content = content
    
    def find_places(self):
        """docstring for places"""
        if self.content.startswith('http'):
            data = get_url_content(self.content)
        else:
            data = self.content
Example #35
def buildpattern(html, debug):
    doc = {}
    docwords = {}
    structure = []
    fulltext = []
    title = ''
    attributes = {}
    x = []
    y = []
    
    # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"]
    html = re.sub(r'<script', "\n<script", html)
    html = re.sub(r'<style', "\n<style", html)
    html = re.sub(r'<\/script>', "\n</script>\n", html)
    html = re.sub(r'<meta ', "\n<meta ", html)
    html = re.sub(r'<\/title>', "</title>\n", html)
    html = re.sub(r'<\/div>', "</div>\n", html)
    html = re.sub(r'<\/p>', "</p>\n", html)
    html = re.sub(r'<\/li>', "</li>\n", html)
    html = re.sub(r'<\/style>', "\n</style>\n", html)
    html = re.sub(r'<\/dd>', "</dd>\n", html)

    htmlstrings = html.splitlines()

    if htmlstrings:
        lineID = 0
        active = 1
        for line in htmlstrings:
            lenstr = len(line)
            words = len(line.split())
            comas = len(line.split(","))
            dots = len(line.split("."))
            equal = len(line.split("="))
            soup = BeautifulSoup(line)
            
            if words:
                htmltags = []
                visiblecontent = soup.getText()
                for child in soup.recursiveChildGenerator():
                    name = getattr(child, "name", None)
                    if name is not None:
                         htmltags.append(name)
                    elif not child.isspace():  # leaf text node; nothing to record
                        pass
                matrix = {}
                visiblewords = len(visiblecontent.split())
                
                openignore = re.match(r'<style|<script', line)
                closeignore = re.match(r'<\/style|<\/script', line)
                urlstatus = re.findall(r'<a', line)
                timeflag = re.findall('([0-9]+:[0-9]+)', line)
                if openignore:
                    active = 0 
                            
                matrix['words'] = str(words)
                matrix['visiblewords'] = 0
                matrix['comas'] = comas
                matrix['dots'] = dots
                matrix['equal'] = equal
                matrix['html'] = line
                matrix['status'] = 'active'
                if timeflag:
                    matrix['timeflag'] = str(timeflag)
                else:
                    matrix['timeflag'] = ''
                matrix['tags'] = str(visiblecontent)
                if urlstatus:
                    matrix['urlstatus'] = 1
                else:
                    matrix['urlstatus'] = 0
                code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal) + ',U' + str(matrix['urlstatus']) + 'T' + matrix['timeflag']
                matrix['code'] = code
                if visiblewords > 0:
                    matrix['visiblewords'] = str(visiblewords)
                if active == 0:
                    matrix['visiblewords'] = 0
                    matrix['status'] = 'ignored'
                if visiblewords <= 1:
                    matrix['status'] = 'ignored'
                doc[lineID] = matrix

                if closeignore:
                    active = 1

            lineID = lineID + 1    
        
    if debug:
        # walk the lines in numeric order
        for lineID, item in sorted(doc.items()):
            line = str(item['html'])
            if lineID:
                code = item['code']
                words = item['visiblewords']
                tags = item['tags']
                status = item['status']
                x.append(lineID)
                y.append(int(words))
                if status == 'active':
                    outstr = str(lineID) + ',' + code + ',' + line + '\t' + tags                    
                    #print outstr + '\n'
                    f.write(outstr + '\n')  # 'f' is assumed to be an output file opened elsewhere in the module

    return (x,y,doc)