Example 1
def visit(url):
    if not url.startswith(base_url):
        return

    try:
        resp = urlopen(url)
    except URLError as e:
        return

    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    # 	if link.has_attr('href'):
    # 		if link.has_attr('class') and 'history' in link['class']:
    # 			continue
    # 		next_link = urljoin(url,link['href'])
    # 		next_link = urldefrag(next_link)[0]
    # 		if next_link not in visited_pages:
    # 			visited_pages.append(next_link)
    # 			pages_to_visit.append(next_link)
    f = open("testing.txt", 'w')
    f.write(page)

    clean_page = cleaner.clean_html(page)
    f.write("\n\n\nVS\n\n\n")
    f.write(clean_page)
    f.close()
    soup = BeautifulSoup(clean_page, "lxml")
    extract(soup, url)
Example 2
def visit(url):
	if url.startswith(base_url) == False:
		return

	try:
		resp = urlopen(url)
	except URLError as e:
		return

	page = resp.read()
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	# soup = BeautifulSoup(page, "lxml")
	# for link in soup.findAll('a'):
	# 	if link.has_attr('href'):
	# 		if link.has_attr('class') and 'history' in link['class']:
	# 			continue
	# 		next_link = urljoin(url,link['href'])
	# 		next_link = urldefrag(next_link)[0]
	# 		if next_link not in visited_pages:
	# 			visited_pages.append(next_link)
	# 			pages_to_visit.append(next_link)

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)
Example 3
 def extract_text(self, url):
     try:
         if url.value.startswith('http') and '://' in url.value:
             prog = FloatProgress(min=0, max=100, description='Progress')
             display(widgets.HTML('<br/>'), prog)
             tr0 = time()
             site = self.browser.get(url.value, timeout=10)
             if site.ok:
                 prog.value += 50
                 tr1 = time() - tr0
                 t0 = time()
                 cleaner = Cleaner()
                 cleaner.javascript = True
                 cleaner.style = True
                 cleaner.kill_tags = ['header', 'footer']
                 source_tree = etree.HTML(cleaner.clean_html(site.content))
                 text = source_tree.itertext()
                 t1 = time() - t0
                 self.text = '\n'.join(
                     [n.strip() for n in text if n.strip()])
                 prog.value += 50
                 self.keywords_and_display(prog)
             else:
                 display(
                     widgets.HTML(
                         '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                     ))
         else:
             self.text = url.value
             self.keywords_and_display(False)
     except Exception as e:
         print 'Error extracting text: %s' % (e)
Example 4
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
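
A minimal usage sketch for the factory above, assuming from lxml.html.clean import Cleaner is already in scope; the HTML string is illustrative:

raw = '<div><script>alert(1)</script><p>Hello <b>world</b></p><ul><li>skip</li></ul></div>'
clean = cleaner_parameters().clean_html(raw)
# kill_tags entries ('script', 'ul', 'li', ...) are dropped together with their content;
# remove_tags entries ('div', 'p', 'b', ...) are unwrapped, keeping their text.
print(clean)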
Example 5
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b','img','h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except Exception:
        # error: ValueError: Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration.
        content = u""
    return content
Example 6
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
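
A quick call sketch for the helper above; the sample markup is illustrative, and clean_html returns the cleaned markup as a string when given a string:

sample = '<p>Hi<script>alert(1)</script><style>p { color: red }</style></p>'
print(remove_script_and_style(sample))  # script and style elements are removed, contents included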
Example 7
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
Example 8
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms= True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False  # required when allow_tags is set
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces,"",self.doc)
    return doc
Example 9
 def init_cleaner():
     from lxml.html.clean import Cleaner
     cleaner = Cleaner()
     cleaner.javascript = False
     cleaner.style = False
     cleaner.kill_tags = ["pre", "code"]
     return cleaner
Example 10
    def clearTag_old(self, text: str) -> str:
        import lxml
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        return lxml.html.tostring(
            cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
Example 11
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Example 12
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
Example 13
def remove_tags(html_str, tags):
    content_hash = md5(html_str.encode('utf-8')).hexdigest()
    wrapper_class = f'remove-tags-wrapper-{content_hash}'

    html_str = f'<div class="{wrapper_class}">{html_str}</div>'
    tree = html.document_fromstring(html_str)

    cleaner = Cleaner()
    cleaner.kill_tags = tags.split()

    tree = cleaner.clean_html(tree)
    tree = tree.find_class(wrapper_class)[0]

    return mark_safe(html.tostring(tree).decode('utf-8'))
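
The mark_safe call suggests this is a Django template-filter helper; a hedged call sketch with illustrative arguments (the second argument is a space-separated list of tags to kill):

snippet = '<p>Keep <iframe src="ad"></iframe> this <script>track()</script></p>'
print(remove_tags(snippet, 'iframe script'))
# -> the wrapper <div class="remove-tags-wrapper-..."> with the iframe and script removed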
Example 14
def clean_file(file):

    cleaner = Cleaner()

    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
    cleaner.kill_tags = ['head', 'img', 'iframe', 'nav', 'svg', 'figure', 'map']

    file = cleaner.clean_html(file)

    file = file.split()
    file = " ".join(file)

    # print(file)

    return file
Example 15
def get_text(session, url, title, dir):
    r = session.get(url, stream=True)
    doc = lxml.html.fromstring(r.text)
    sidebar = doc.find_class('course-sidebar')[0]
    sidebar.getparent().remove(sidebar)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.meta = True
    cleaner.kill_tags = ['header']

    cleantext = lxml.html.tostring(cleaner.clean_html(doc))

    filename = os.path.join(dir, title + '.html')
    with open(filename, 'w') as fout:
        print 'Downloading [T] ' + title + ' ...'
        fout.write(cleantext)
Example 16
    def clean(self: T) -> str:
        cleaner = Cleaner()
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # allow_tags and remove_unknown_tags can't work together
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags
        if self.__kill_tags is not None: cleaner.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            cleaner.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            cleaner.safe_attrs = self.__safe_attrs

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
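
The comment about allow_tags and remove_unknown_tags reflects an actual lxml constraint: Cleaner refuses to run when both are set, which is why the wrapper above switches remove_unknown_tags off whenever an allow-list is supplied. A standalone sketch of that combination (tag list and markup are illustrative):

from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.remove_unknown_tags = False   # must be disabled when allow_tags is used
cleaner.allow_tags = ['p', 'a', 'b']  # other tags are stripped, their text kept
print(cleaner.clean_html('<p>keep <font color="red">this text</font></p>'))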
Example 17
    def get_url(self):
        """Get the relevant part of a web page."""

        get_url = requests.get(self.data_path)
        page_data = get_url.content

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = cleaner.clean_html(page_data)

        # Strip tags from final results.
        strip_tags = TagStripper()  # Instantiate the HTML Tag Stripper.
        strip_tags.feed(page_html)  # Strip all HTML tags.

        return strip_tags.get_html_data()
Example 18
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
Example 19
    def get_url(self):
        """Get the HTML body of a web page."""

        # Create file-like object.
        outfile = StringIO.StringIO()

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.parse(self.data_path)
            )
        )

        outfile.write(page_html)  # Write the results to this file in memory.

        return outfile
Example 20
import os
import nltk
import codecs
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

reload(sys)
sys.setdefaultencoding('utf8')
cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True
cleaner.kill_tags = ['a', 'img', 'href']
cleaner.remove_tags = ['div', 'span', 'li']

directory1 = "C:\Users\Satanu\html_test\\"
directory2 = "C:\Users\Satanu\text\\"
for filename in os.listdir(directory1):
    to_write = []
    html = codecs.open(directory1 + filename, 'r', 'utf-8')
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(directory1 + filename)))
    name = filename.strip('html')

    text = codecs.open(directory2 + filename, 'w', 'utf-8')

    text.write(raw)
Example 21
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  #  'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])

# counters
tokens_posts = 0
tokens_comments = 0
lrutest = LRU(LRU_SIZE)

# justext
JUSTEXT_STOPLIST = justext.get_stoplist('German')
cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
        return False
    # basic year validation
    year = int(datetime.date.strftime(dateobject, '%Y'))
    if MIN_YEAR <= year <= MAX_YEAR:
        # not newer than today
Example 23
CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
JSON_PATTERN = re.compile(
    r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
# use of regex module for speed
GERMAN_PATTERN = regex.compile(
    r'(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})')
TIMESTAMP_PATTERN = regex.compile(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}'
)


#@profile
def examine_date_elements(tree, expression, outputformat, extensive_search,
Example 24
            )
        )

    html_out.getroottree().write(file="summarized-roanoke.html", method="html")

if __name__ == "__main__":
    
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True    
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]
        
    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)
    
    tree = lxml.html.parse(doc)
    title = tree.find(".//title").text
    
    tree = cleaner.clean_html(tree)

    netloc = urlparse(url).netloc
    if netloc == "en.wikipedia.org":
        parse_wiki(tree, title)
    elif netloc == "cnn.com":
        parse_cnn(tree, title)
    else:
Example 25
receivers = [(name.strip(), mail.strip()) for name, mail in rows
             if name and mail]

# Load template
with open(config.template_path) as f:
    template = f.read()

# Inline styles
import premailer
template = premailer.transform(template)

# Clean HTML
import lxml.html
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.kill_tags = ['style', 'script']
page = cleaner.clean_html(lxml.html.fromstring(template))
assert not page.xpath('//style'), 'style'
assert not page.xpath('//script'), 'script'
template = lxml.html.tostring(page).decode('utf-8')

# Send mails
sender = Mailer('smtp.yandex.com.tr', port='465', use_ssl=True)
sender.login(config.user_mail, getpass('Password: '******'start')
for receiver_name, receiver_mail in receivers:
    try:
        message = Message(From=config.user_mail,
                          To=receiver_mail,
                          charset="utf-8")
        attachment_path = glob(
Example 26
import lxml
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True # This is True because we want to activate the javascript filter
cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
cleaner.kill_tags = ['head', 'script', 'header', 'href', 'footer']

print (lxml.html.tostring(cleaner.clean_html(lxml.html.parse('/home/caiocesare/PycharmProjects/script/1.html'))))
Example 27
import re

import lxml.html
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError
from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set([])
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2',
    'h3', 'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
    def get_text(web_source):
        try:
            _html = lxml.html.document_fromstring(web_source)
        except XMLSyntaxError:
            print("Exception when convert web source to html document")
            return web_source
Example 28
parser.add_argument("-f","--file", help="metalink article name",required=True)
args = parser.parse_args()
filename = args.file

# file check
if not os.path.isfile(filename) and not os.access(sys.argv[1], os.R_OK):
    print "WARNING - Couldn't find specified file!"
    sys.exit(1)
elif not os.path.exists('original'):
    print 'Creating original directory for backups...'
    os.makedirs('original')

# cleaner
cleaner = Cleaner(page_structure=False)
cleaner.remove_tags = ["span"]
cleaner.kill_tags = ["script","img","style"]


# original file conversion
original = codecs.open(filename,"r","cp866")
for line in original:
    line = re.sub(r"[^\x00-\x7F]+","",line)
    #if "&nbsp;" in line:
        #line = re.sub(r"&nbsp;", "", line)
    if "&reg;" in line:
        line = line.replace("&reg;","")
    number = re.search(r"<span style=\"display:none\">\d+</span>", line)
    if number:
        line = re.sub(r"<span style=\"display:none\">\d+</span>", "", line)
    footer = re.search(r"Didn't find what you are looking for\?", line)
    if footer:
Example 29
    def get_context_data(self, **kwargs):
        headers = cache.get(self.object.id, version="email-header")
        if headers is None:
            headers = models.Header.objects.filter(part__email=self.object, part__parent=None)
            headers = headers.get_many("Subject", "From")

        email_dict = {}
        email_dict["subject"] = headers.get("Subject", "(No subject)")
        email_dict["from"] = headers["From"]
        email_dict["date"] = self.object.received_date
        email_dict["inbox"] = self.object.inbox
        email_dict["eid"] = self.object.eid

        # iterate over MIME parts
        html = None
        plain = None
        attachments = []
        for part in self.object.parts.all():
            part_head = part.header_set.get_many("Content-Type", "Content-Disposition")
            part_head["content_type"] = part_head.pop("Content-Type", "").split(";", 1)
            dispos = part_head.pop("Content-Disposition", "")

            if part_head["content_type"][0].startswith("multipart") or part_head["content_type"][0].startswith(
                "message"
            ):
                continue

            try:
                params = dict(HEADER_PARAMS.findall(part_head["content_type"][1]))
            except IndexError:
                params = {}
            params.update(dict(HEADER_PARAMS.findall(dispos)))

            # find filename, could be anywhere
            if "filename" in params:
                part_head["filename"] = params["filename"]
            elif "name" in params:
                part_head["filename"] = params["name"]
            else:
                part_head["filename"] = ""

            # grab charset
            part.charset = params.get("charset", "utf-8")

            if html is None and part_head["content_type"][0] == "text/html":
                html = part
            elif plain is None and part_head["content_type"][0] == "text/plain":
                plain = part

            attachments.append((part, part_head))

        # set raw body
        plain_message = self.find_body(html, plain)
        if plain_message is None:
            if len(attachments) == 1:
                email_dict["body"] = str(attachments[0][0].body.data)
                email_dict["charset"] = attachments[0][0].charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"
            plain_message = True
        elif plain_message:
            email_dict["body"] = str(plain.body.data)
            email_dict["charset"] = plain.charset
        else:
            email_dict["body"] = str(html.body.data)
            email_dict["charset"] = html.charset

        if not plain_message:
            # Mail Pile uses this, give back if you come up with something better
            cleaner = Cleaner(
                page_structure=True,
                meta=True,
                links=True,
                javascript=True,
                scripts=True,
                frames=True,
                embedded=True,
                safe_attrs_only=True,
            )
            cleaner.kill_tags = ["style", "base"]  # remove style tags, not attrs

            try:
                email_dict["body"] = Premailer(email_dict["body"]).transform()
            except Exception:
                # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
                messages.warning(
                    self.request, _("Part of this message could not be parsed - it may not display correctly")
                )

            try:
                email_dict["body"] = cleaner.clean_html(email_dict["body"])
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

                plain_message = True
                messages.error(self.request, _("This email contained invalid HTML and could not be displayed"))

        self.headline = email_dict["subject"]

        # GET params for users with `ask_image` set in their profile
        if plain_message:
            # bypass image scrubber
            img_display = True
            ask_images = False
        elif "imgDisplay" in self.request.GET and int(self.request.GET["imgDisplay"]) == 1:
            img_display = True
            ask_images = False
        elif self.request.user.userprofile.flags.ask_images:
            img_display = False
            ask_images = True
        else:
            img_display = self.request.user.userprofile.flags.display_images
            ask_images = False

        # filter images if we need to
        if not img_display:
            try:
                tree = lxml_html.fromstring(email_dict["body"])
                for img in tree.findall(".//img"):
                    try:
                        del img.attrib["src"]
                    except KeyError:
                        pass
                email_dict["body"] = etree.tostring(tree)
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

        # convert to unicode as late as possible
        email_dict["body"] = unicode(email_dict["body"], email_dict["charset"], errors="replace")

        context = super(EmailView, self).get_context_data(**kwargs)
        context.update(
            {"email": email_dict, "plain_message": plain_message, "attachments": attachments, "ask_images": ask_images}
        )

        return context
Example 30
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    if include_tables is False:
        MANUALLY_CLEANED.append('table')
    if include_images is False:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        MANUALLY_CLEANED.extend(['figure', 'picture', 'source'])
        MANUALLY_STRIPPED.append('img')
    for expression in MANUALLY_CLEANED:
        for element in tree.getiterator(expression):
            try:
                element.drop_tree()
            except AttributeError:
                continue
Example 31
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())
    
    if len(links[3].cssselect("a")[0].attrib['href'])> len('http://'):

        web = links[3].cssselect("a")[0].attrib['href']

    else:
    
        web = ""

    return direccion, telefono, fax, web
    

cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1,45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag='+str(i)
    
    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")
    
    for link in links:

        record = {}

        name = link.cssselect("a")[0].text_content()
        card_link = link.cssselect("a")[0].attrib['href']
        address = link.cssselect("p")[0].text_content()
Example 32
# earliest possible year to take into account (inclusive)
MIN_DATE = datetime.date(1995, 1, 1)
MIN_YEAR = MIN_DATE.year
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = LATEST_POSSIBLE.year

# set an upper limit to the number of candidates
MAX_POSSIBLE_CANDIDATES = 150

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'applet', 'audio', 'canvas', 'datalist', 'embed', 'figure', 'label', 'map',
    'math', 'object', 'picture', 'rdf', 'svg', 'video'
]
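
A hedged sketch of how a module-level cleaner like this is typically applied afterwards; the sample document is illustrative:

import lxml.html

tree = lxml.html.fromstring('<html><body><p>text</p><video src="clip.mp4"></video></body></html>')
cleaned = HTML_CLEANER.clean_html(tree)  # returns the cleaned document
print(lxml.html.tostring(cleaned, encoding='unicode'))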
Example 33
    def get_context_data(self, **kwargs):
        headers = cache.get(self.object.id, version="email-header")
        if headers is None:
            headers = models.Header.objects.filter(part__email=self.object, part__parent=None)
            headers = headers.get_many("Subject", "From")

        email_dict = {}
        email_dict["subject"] = headers.get("Subject", '(No subject)')
        email_dict["from"] = headers["From"]
        email_dict["date"] = self.object.received_date
        email_dict["inbox"] = self.object.inbox
        email_dict["eid"] = self.object.eid

        # iterate over MIME parts
        html = None
        plain = None
        attachments = []
        for part in self.object.parts.all():
            part_head = part.header_set.get_many("Content-Type", "Content-Disposition")
            part_head["content_type"] = part_head.pop("Content-Type", "").split(";", 1)
            dispos = part_head.pop("Content-Disposition", "")

            if part_head["content_type"][0].startswith("multipart") or part_head["content_type"][0].startswith("message"):
                continue

            try:
                params = dict(HEADER_PARAMS.findall(part_head["content_type"][1]))
            except IndexError:
                params = {}
            params.update(dict(HEADER_PARAMS.findall(dispos)))

            # find filename, could be anywhere
            if "filename" in params:
                part_head["filename"] = params["filename"]
            elif "name" in params:
                part_head["filename"] = params["name"]
            else:
                part_head["filename"] = ""

            # grab charset
            part.charset = params.get("charset", "utf-8")

            if html is None and part_head["content_type"][0] == "text/html":
                html = part
            elif plain is None and part_head["content_type"][0] == "text/plain":
                plain = part

            attachments.append((part, part_head))

        # set raw body
        plain_message = self.find_body(html, plain)
        if plain_message is None:
            if len(attachments) == 1:
                email_dict["body"] = str(attachments[0][0].body.data)
                email_dict["charset"] = attachments[0][0].charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"
            plain_message = True
        elif plain_message:
            email_dict["body"] = str(plain.body.data)
            email_dict["charset"] = plain.charset
        else:
            email_dict["body"] = str(html.body.data)
            email_dict["charset"] = html.charset

        if not plain_message:
            # Mail Pile uses this, give back if you come up with something better
            cleaner = Cleaner(page_structure=True, meta=True, links=True,
                       javascript=True, scripts=True, frames=True,
                       embedded=True, safe_attrs_only=True)
            cleaner.kill_tags = [
                        "style", # remove style tags, not attrs
                        "base",
                        ]

            try:
                email_dict["body"] = Premailer(email_dict["body"]).transform()
            except Exception:
                # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
                messages.warning(self.request, _("Part of this message could not be parsed - it may not display correctly"))

            try:
                email_dict["body"] = cleaner.clean_html(email_dict["body"])
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

                plain_message = True
                messages.error(self.request, _("This email contained invalid HTML and could not be displayed"))

        self.headline = email_dict["subject"]

        # GET params for users with `ask_image` set in their profile
        if plain_message:
            # bypass image scrubber
            img_display = True
            ask_images = False
        elif "imgDisplay" in self.request.GET and int(self.request.GET["imgDisplay"]) == 1:
            img_display = True
            ask_images = False
        elif self.request.user.userprofile.flags.ask_images:
            img_display = False
            ask_images = True
        else:
            img_display = self.request.user.userprofile.flags.display_images
            ask_images = False

        # filter images if we need to
        if not img_display:
            try:
                tree = lxml_html.fromstring(email_dict["body"])
                for img in tree.findall(".//img"):
                    try:
                        del img.attrib["src"]
                    except KeyError:
                        pass
                email_dict["body"] = etree.tostring(tree)
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

        # convert to unicode as late as possible
        email_dict["body"] = unicode(email_dict["body"], email_dict["charset"], errors="replace")

        context = super(EmailView, self).get_context_data(**kwargs)
        context.update({
                        "email": email_dict,
                        "plain_message": plain_message,
                        "attachments": attachments,
                        "ask_images": ask_images,
                        })

        return context
Example 34
from lxml.html import fragments_fromstring
from PIL import Image, ImageDraw, ImageFont

from rs_mailer import EmailSender
import base64
import tempfile

import requests
from readability import Document
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['div', 'span']
cleaner.kill_tags = ['svg']

# This script will create an opf version of The Guardian (or The
# Observer on Sunday) suitable for turning into a .mobi file for
# copying to your Kindle.
blacklisted_section_names = ['pictures']

get_paper_articles = False

email_send = False

sleep_seconds_after_api_call = 2

# Check the path of the directory where this script is located
# to read keys and config files 
# (Ignore symbolic links)
Example 35
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False  # True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False  # True
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = ['a', 'abbr', 'acronym', 'address', 'big', 'cite', 'dd', 'font', 'ins', 'meta', 'span', 'small', 'sub', 'sup', 'wbr'] #  'center', 'table', 'tbody', 'td', 'th', 'tr',
HTML_CLEANER.remove_tags = ['img']
HTML_CLEANER.kill_tags = ['aside', 'del']
# 'area', 'table' # 'header'

CUT_EMPTY_ELEMS = {
    'article', 'b', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'li',
    'main', 'p', 'section', 'span', 'strong', 'td'
}
# 'meta',

MANUALLY_CLEANED = [
    'audio', 'blink', 'button', 'canvas', 'embed', 'figure', 'footer', 'form',
    'head', 'iframe', 'input', 'link', 'map', 'marquee', 'math', 'nav',
    'noscript', 'object', 'picture', 'script', 'style', 'svg', 'time', 'video'
]
# 'frame' 'frameset' 'source', 'img',
Example 36
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())

    if len(links[3].cssselect("a")[0].attrib['href']) > len('http://'):

        web = links[3].cssselect("a")[0].attrib['href']

    else:

        web = ""

    return direccion, telefono, fax, web


cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1, 45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag=' + str(
        i)

    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")

    for link in links:

        record = {}

        name = link.cssselect("a")[0].text_content()
        card_link = link.cssselect("a")[0].attrib['href']
Example 37
MIN_FILE_SIZE = 10

# Plausible dates
# earliest possible year to take into account (inclusive)
MIN_YEAR = 1995
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = datetime.date.today().year

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]
# 'embed', 'figure', 'img', 'table'