Code Example #1
File: experiment.py Project: veeann/example-mining
def visit(url):
    if not url.startswith(base_url):
        return

    try:
        resp = urlopen(url)
    except URLError as e:
        return

    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    # 	if link.has_attr('href'):
    # 		if link.has_attr('class') and 'history' in link['class']:
    # 			continue
    # 		next_link = urljoin(url,link['href'])
    # 		next_link = urldefrag(next_link)[0]
    # 		if next_link not in visited_pages:
    # 			visited_pages.append(next_link)
    # 			pages_to_visit.append(next_link)
    f = open("testing.txt", 'w')
    f.write(page)

    clean_page = cleaner.clean_html(page)
    f.write("\n\n\nVS\n\n\n")
    f.write(clean_page)
    f.close()
    soup = BeautifulSoup(clean_page, "lxml")
    return
    extract(soup, url)
Code Example #2
File: mine.py Project: veeann/example-mining
def visit(url):
	if not url.startswith(base_url):
		return

	try:
		resp = urlopen(url)
	except URLError as e:
		return

	page = resp.read()
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	# soup = BeautifulSoup(page, "lxml")
	# for link in soup.findAll('a'):
	# 	if link.has_attr('href'):
	# 		if link.has_attr('class') and 'history' in link['class']:
	# 			continue
	# 		next_link = urljoin(url,link['href'])
	# 		next_link = urldefrag(next_link)[0]
	# 		if next_link not in visited_pages:
	# 			visited_pages.append(next_link)
	# 			pages_to_visit.append(next_link)

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)
Code Example #3
 def extract_text(self, url):
     try:
         if url.value.startswith('http') and '://' in url.value:
             prog = FloatProgress(min=0, max=100, description='Progress')
             display(widgets.HTML('<br/>'), prog)
             tr0 = time()
             site = self.browser.get(url.value, timeout=10)
             if site.ok:
                 prog.value += 50
                 tr1 = time() - tr0
                 t0 = time()
                 cleaner = Cleaner()
                 cleaner.javascript = True
                 cleaner.style = True
                 cleaner.kill_tags = ['header', 'footer']
                 source_tree = etree.HTML(cleaner.clean_html(site.content))
                 text = source_tree.itertext()
                 t1 = time() - t0
                 self.text = '\n'.join(
                     [n.strip() for n in text if n.strip()])
                 prog.value += 50
                 self.keywords_and_display(prog)
             else:
                 display(
                     widgets.HTML(
                         '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                     ))
         else:
             self.text = url.value
             self.keywords_and_display(False)
     except Exception as e:
         print 'Error extracting text: %s' % (e)
Code Example #4
File: Task1.py Project: dipanjan44/Python-Projects
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
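The helper above relies on the difference between remove_tags (the tag itself is stripped but its text is pulled up into the parent) and kill_tags (the element is dropped together with everything inside it). A minimal sketch of that distinction, assuming nothing beyond lxml and a made-up fragment:

from lxml.html.clean import Cleaner

# hypothetical fragment, not taken from the project above
fragment = '<div><p>keep this text</p><script>tracking()</script><b>unwrap me</b></div>'

sketch_cleaner = Cleaner()
sketch_cleaner.remove_tags = ['b']     # strip the <b> tag, keep the text "unwrap me"
sketch_cleaner.kill_tags = ['script']  # drop the <script> element and its content
print(sketch_cleaner.clean_html(fragment))  # the <b> text survives, the script body does not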
Code Example #5
File: crawl.py Project: mmzz42/hactar
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # ValueError: Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration.
        content = u""
    return content
Code Example #6
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
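A short usage sketch for the helper above (assuming Cleaner is imported from lxml.html.clean in the same module; the HTML string is made up):

sample = ('<html><head><style>p { color: red; }</style></head>'
          '<body><p>visible text</p><script>alert("x");</script></body></html>')
print(remove_script_and_style(sample))  # style and script content removed, the paragraph kept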
Code Example #7
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
Code Example #8
File: extractor.py Project: cortext/crawtext
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False  # required by lxml when allow_tags is set
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces,"",self.doc)
    return doc
Code Example #9
 def init_cleaner():
     from lxml.html.clean import Cleaner
     cleaner = Cleaner()
     cleaner.javascript = False
     cleaner.style = False
     cleaner.kill_tags = ["pre", "code"]
     return cleaner
Code Example #10
    def clearTag_old(self, text: str) -> str:
        import lxml
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        return lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.document_fromstring(text))).decode("utf-8")
Code Example #11
File: mailer.py Project: a25kk/julia
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Code Example #12
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
Code Example #13
def remove_tags(html_str, tags):
    content_hash = md5(html_str.encode('utf-8')).hexdigest()
    wrapper_class = f'remove-tags-wrapper-{content_hash}'

    html_str = f'<div class="{wrapper_class}">{html_str}</div>'
    tree = html.document_fromstring(html_str)

    cleaner = Cleaner()
    cleaner.kill_tags = tags.split()

    tree = cleaner.clean_html(tree)
    tree = tree.find_class(wrapper_class)[0]

    return mark_safe(html.tostring(tree).decode('utf-8'))
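A hedged usage sketch for the filter above, assuming the imports it implies (hashlib's md5, lxml.html as html, Django's mark_safe and lxml's Cleaner) and a made-up fragment. Note that the wrapper div with the hashed class stays in the returned markup; it is what lets the helper recover the original fragment from the full document tree lxml builds around it:

fragment = '<p>Keep me</p><script>alert(1)</script>'
cleaned = remove_tags(fragment, 'script')  # tags is passed as a space-separated string
print(cleaned)  # the <script> element is gone, the wrapper div remains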
Code Example #14
def clean_file(file):

    cleaner = Cleaner()

    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
    cleaner.kill_tags = ['head', 'img', 'iframe', 'nav', 'svg', 'figure', 'map']

    file = cleaner.clean_html(file)

    file = file.split()
    file = " ".join(file)

    # print(file)

    return file
Code Example #15
def get_text(session, url, title, dir):
    r = session.get(url, stream=True)
    doc = lxml.html.fromstring(r.text)
    sidebar = doc.find_class('course-sidebar')[0]
    sidebar.getparent().remove(sidebar)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.meta = True
    cleaner.kill_tags = ['header']

    cleantext = lxml.html.tostring(cleaner.clean_html(doc))

    filename = os.path.join(dir, title + '.html')
    with open(filename, 'w') as fout:
        print 'Downloading [T] ' + title + ' ...'
        fout.write(cleantext)
Code Example #16
    def clean(self: T) -> str:
        cleaner = Cleaner()
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # allow_tags and remove_unknown_tags can't work together
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags
        if self.__kill_tags is not None:
            cleaner.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            cleaner.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            cleaner.safe_attrs = self.__safe_attrs

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
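The comment about allow_tags and remove_unknown_tags reflects an lxml constraint: Cleaner raises a ValueError if allow_tags is set while remove_unknown_tags is still True (its default). A minimal sketch of the working combination, independent of the class above:

from lxml.html.clean import Cleaner

sketch = Cleaner()
sketch.remove_unknown_tags = False  # must be disabled before allow_tags is used
sketch.allow_tags = ['p', 'a']      # everything else is unwrapped
sketch.kill_tags = ['script']       # scripts are dropped entirely
print(sketch.clean_html('<div><p>ok</p><script>x()</script></div>'))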
Code Example #17
File: Data.py Project: redteamcaliber/intel
    def get_url(self):
        """Get the relevant part of a web page."""

        get_url = requests.get(self.data_path)
        page_data = get_url.content

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = cleaner.clean_html(page_data)

        # Strip tags from final results.
        strip_tags = TagStripper()  # Instantiate the HTML Tag Stripper.
        strip_tags.feed(page_html)  # Strip all HTML tags.

        return strip_tags.get_html_data()
Code Example #18
File: blogpostparser.py Project: wo/opp-tools
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
Code Example #19
File: Data.py Project: lukaszbb/openiocscripts
    def get_url(self):
        """Get the HTML body of a web page."""

        # Create file-like object.
        outfile = StringIO.StringIO()

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.parse(self.data_path)
            )
        )

        outfile.write(page_html)  # Write the results to this file in memory.

        return outfile
Code Example #20
import os
import nltk
import codecs
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

reload(sys)
sys.setdefaultencoding('utf8')
cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True
cleaner.kill_tags = ['a', 'img', 'href']
cleaner.remove_tags = ['div', 'span', 'li']

directory1 = "C:\Users\Satanu\html_test\\"
directory2 = "C:\Users\Satanu\text\\"
for filename in os.listdir(directory1):
    to_write = []
    html = codecs.open(directory1 + filename, 'r', 'utf-8')
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(directory1 + filename)))
    name = filename.strip('html')

    text = codecs.open(directory2 + filename, 'w', 'utf-8')

    text.write(raw)
Code Example #21
File: core.py Project: DerKozmonaut/trafilatura
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  #  'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])

# counters
tokens_posts = 0
tokens_comments = 0
lrutest = LRU(LRU_SIZE)

# justext
JUSTEXT_STOPLIST = justext.get_stoplist('German')
Code Example #22
cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
        return False
    # basic year validation
    year = int(datetime.date.strftime(dateobject, '%Y'))
    if MIN_YEAR <= year <= MAX_YEAR:
        # not newer than today
Code Example #23
File: core.py Project: mpk112/htmldate
CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
JSON_PATTERN = re.compile(
    r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
# use of regex module for speed
GERMAN_PATTERN = regex.compile(
    r'(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})')
TIMESTAMP_PATTERN = regex.compile(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}'
)


#@profile
def examine_date_elements(tree, expression, outputformat, extensive_search,
Code Example #24
File: parser.py Project: mcnultyc/summarize-document
            )
        )

    html_out.getroottree().write(file="summarized-roanoke.html", method="html")

if __name__ == "__main__":
    
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True    
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]
        
    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)
    
    tree = lxml.html.parse(doc)
    title = tree.find(".//title").text
    
    tree = cleaner.clean_html(tree)

    netloc = urlparse(url).netloc
    if netloc == "en.wikipedia.org":
        parse_wiki(tree, title)
    elif netloc == "cnn.com":
        parse_cnn(tree, title)
    else:
Code Example #25
receivers = [(name.strip(), mail.strip()) for name, mail in rows
             if name and mail]

# Load template
with open(config.template_path) as f:
    template = f.read()

# Inline styles
import premailer
template = premailer.transform(template)

# Clean HTML
import lxml.html
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.kill_tags = ['style', 'script']
page = cleaner.clean_html(lxml.html.fromstring(template))
assert not page.xpath('//style'), 'style'
assert not page.xpath('//script'), 'script'
template = lxml.html.tostring(page).decode('utf-8')

# Send mails
sender = Mailer('smtp.yandex.com.tr', port='465', use_ssl=True)
sender.login(config.user_mail, getpass('Password: '******'start')
for receiver_name, receiver_mail in receivers:
    try:
        message = Message(From=config.user_mail,
                          To=receiver_mail,
                          charset="utf-8")
        attachment_path = glob(
Code Example #26
import lxml
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True # This is True because we want to activate the javascript filter
cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
cleaner.kill_tags = ['head', 'script', 'header', 'href', 'footer']

print (lxml.html.tostring(cleaner.clean_html(lxml.html.parse('/home/caiocesare/PycharmProjects/script/1.html'))))
Code Example #27
import re
import lxml.html
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError
from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set([])
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2',
    'h3', 'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
    def get_text(web_source):
        try:
            _html = lxml.html.document_fromstring(web_source)
        except XMLSyntaxError:
            print("Exception when convert web source to html document")
            return web_source
Code Example #28
File: metalink.py Project: transient-sepia/metalink
parser.add_argument("-f","--file", help="metalink article name",required=True)
args = parser.parse_args()
filename = args.file

# file check
if not os.path.isfile(filename) and not os.access(sys.argv[1], os.R_OK):
    print "WARNING - Couldn't find specified file!"
    sys.exit(1)
elif not os.path.exists('original'):
    print 'Creating original directory for backups...'
    os.makedirs('original')

# cleaner
cleaner = Cleaner(page_structure=False)
cleaner.remove_tags = ["span"]
cleaner.kill_tags = ["script","img","style"]


# original file conversion
original = codecs.open(filename,"r","cp866")
for line in original:
    line = re.sub(r"[^\x00-\x7F]+","",line)
    #if "&nbsp;" in line:
        #line = re.sub(r"&nbsp;", "", line)
    if "&reg;" in line:
        line = line.replace("&reg;","")
    number = re.search(r"<span style=\"display:none\">\d+</span>", line)
    if number:
        line = re.sub(r"<span style=\"display:none\">\d+</span>", "", line)
    footer = re.search(r"Didn't find what you are looking for\?", line)
    if footer:
Code Example #29
File: email.py Project: jesskay/website
    def get_context_data(self, **kwargs):
        headers = cache.get(self.object.id, version="email-header")
        if headers is None:
            headers = models.Header.objects.filter(part__email=self.object, part__parent=None)
            headers = headers.get_many("Subject", "From")

        email_dict = {}
        email_dict["subject"] = headers.get("Subject", "(No subject)")
        email_dict["from"] = headers["From"]
        email_dict["date"] = self.object.received_date
        email_dict["inbox"] = self.object.inbox
        email_dict["eid"] = self.object.eid

        # iterate over MIME parts
        html = None
        plain = None
        attachments = []
        for part in self.object.parts.all():
            part_head = part.header_set.get_many("Content-Type", "Content-Disposition")
            part_head["content_type"] = part_head.pop("Content-Type", "").split(";", 1)
            dispos = part_head.pop("Content-Disposition", "")

            if part_head["content_type"][0].startswith("multipart") or part_head["content_type"][0].startswith(
                "message"
            ):
                continue

            try:
                params = dict(HEADER_PARAMS.findall(part_head["content_type"][1]))
            except IndexError:
                params = {}
            params.update(dict(HEADER_PARAMS.findall(dispos)))

            # find filename, could be anywhere
            if "filename" in params:
                part_head["filename"] = params["filename"]
            elif "name" in params:
                part_head["filename"] = params["name"]
            else:
                part_head["filename"] = ""

            # grab charset
            part.charset = params.get("charset", "utf-8")

            if html is None and part_head["content_type"][0] == "text/html":
                html = part
            elif plain is None and part_head["content_type"][0] == "text/plain":
                plain = part

            attachments.append((part, part_head))

        # set raw body
        plain_message = self.find_body(html, plain)
        if plain_message is None:
            if len(attachments) == 1:
                email_dict["body"] = str(attachments[0][0].body.data)
                email_dict["charset"] = attachments[0][0].charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"
            plain_message = True
        elif plain_message:
            email_dict["body"] = str(plain.body.data)
            email_dict["charset"] = plain.charset
        else:
            email_dict["body"] = str(html.body.data)
            email_dict["charset"] = html.charset

        if not plain_message:
            # Mail Pile uses this, give back if you come up with something better
            cleaner = Cleaner(
                page_structure=True,
                meta=True,
                links=True,
                javascript=True,
                scripts=True,
                frames=True,
                embedded=True,
                safe_attrs_only=True,
            )
            cleaner.kill_tags = ["style", "base"]  # remove style tags, not attrs

            try:
                email_dict["body"] = Premailer(email_dict["body"]).transform()
            except Exception:
                # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
                messages.warning(
                    self.request, _("Part of this message could not be parsed - it may not display correctly")
                )

            try:
                email_dict["body"] = cleaner.clean_html(email_dict["body"])
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

                plain_message = True
                messages.error(self.request, _("This email contained invalid HTML and could not be displayed"))

        self.headline = email_dict["subject"]

        # GET params for users with `ask_image` set in their profile
        if plain_message:
            # bypass image scrubber
            img_display = True
            ask_images = False
        elif "imgDisplay" in self.request.GET and int(self.request.GET["imgDisplay"]) == 1:
            img_display = True
            ask_images = False
        elif self.request.user.userprofile.flags.ask_images:
            img_display = False
            ask_images = True
        else:
            img_display = self.request.user.userprofile.flags.display_images
            ask_images = False

        # filter images if we need to
        if not img_display:
            try:
                tree = lxml_html.fromstring(email_dict["body"])
                for img in tree.findall(".//img"):
                    try:
                        del img.attrib["src"]
                    except KeyError:
                        pass
                email_dict["body"] = etree.tostring(tree)
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

        # convert to unicode as late as possible
        email_dict["body"] = unicode(email_dict["body"], email_dict["charset"], errors="replace")

        context = super(EmailView, self).get_context_data(**kwargs)
        context.update(
            {"email": email_dict, "plain_message": plain_message, "attachments": attachments, "ask_images": ask_images}
        )

        return context
Code Example #30
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    if include_tables is False:
        MANUALLY_CLEANED.append('table')
    if include_images is False:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        MANUALLY_CLEANED.extend(['figure', 'picture', 'source'])
        MANUALLY_STRIPPED.append('img')
    for expression in MANUALLY_CLEANED:
        for element in tree.getiterator(expression):
            try:
                element.drop_tree()
            except AttributeError:
Code Example #31
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())
    
    if len(links[3].cssselect("a")[0].attrib['href'])> len('http://'):

        web = links[3].cssselect("a")[0].attrib['href']

    else:
    
        web = ""

    return direccion, telefono, fax, web
    

cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1,45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag='+str(i)
    
    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")
    
    for link in links:

        record = {}

        name = link.cssselect("a")[0].text_content()
        card_link = link.cssselect("a")[0].attrib['href']
        address = link.cssselect("p")[0].text_content()
Code Example #32
File: settings.py Project: vishalbelsare/htmldate
# earliest possible year to take into account (inclusive)
MIN_DATE = datetime.date(1995, 1, 1)
MIN_YEAR = MIN_DATE.year
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = LATEST_POSSIBLE.year

# set an upper limit to the number of candidates
MAX_POSSIBLE_CANDIDATES = 150

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'applet', 'audio', 'canvas', 'datalist', 'embed', 'figure', 'label', 'map',
    'math', 'object', 'picture', 'rdf', 'svg', 'video'
]
Code Example #33
    def get_context_data(self, **kwargs):
        headers = cache.get(self.object.id, version="email-header")
        if headers is None:
            headers = models.Header.objects.filter(part__email=self.object, part__parent=None)
            headers = headers.get_many("Subject", "From")

        email_dict = {}
        email_dict["subject"] = headers.get("Subject", '(No subject)')
        email_dict["from"] = headers["From"]
        email_dict["date"] = self.object.received_date
        email_dict["inbox"] = self.object.inbox
        email_dict["eid"] = self.object.eid

        # iterate over MIME parts
        html = None
        plain = None
        attachments = []
        for part in self.object.parts.all():
            part_head = part.header_set.get_many("Content-Type", "Content-Disposition")
            part_head["content_type"] = part_head.pop("Content-Type", "").split(";", 1)
            dispos = part_head.pop("Content-Disposition", "")

            if part_head["content_type"][0].startswith("multipart") or part_head["content_type"][0].startswith("message"):
                continue

            try:
                params = dict(HEADER_PARAMS.findall(part_head["content_type"][1]))
            except IndexError:
                params = {}
            params.update(dict(HEADER_PARAMS.findall(dispos)))

            # find filename, could be anywhere
            if "filename" in params:
                part_head["filename"] = params["filename"]
            elif "name" in params:
                part_head["filename"] = params["name"]
            else:
                part_head["filename"] = ""

            # grab charset
            part.charset = params.get("charset", "utf-8")

            if html is None and part_head["content_type"][0] == "text/html":
                html = part
            elif plain is None and part_head["content_type"][0] == "text/plain":
                plain = part

            attachments.append((part, part_head))

        # set raw body
        plain_message = self.find_body(html, plain)
        if plain_message is None:
            if len(attachments) == 1:
                email_dict["body"] = str(attachments[0][0].body.data)
                email_dict["charset"] = attachments[0][0].charset
            else:
                email_dict["body"] = ""
                email_dict["charset"] = "utf-8"
            plain_message = True
        elif plain_message:
            email_dict["body"] = str(plain.body.data)
            email_dict["charset"] = plain.charset
        else:
            email_dict["body"] = str(html.body.data)
            email_dict["charset"] = html.charset

        if not plain_message:
            # Mail Pile uses this, give back if you come up with something better
            cleaner = Cleaner(page_structure=True, meta=True, links=True,
                       javascript=True, scripts=True, frames=True,
                       embedded=True, safe_attrs_only=True)
            cleaner.kill_tags = [
                        "style", # remove style tags, not attrs
                        "base",
                        ]

            try:
                email_dict["body"] = Premailer(email_dict["body"]).transform()
            except Exception:
                # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
                messages.warning(self.request, _("Part of this message could not be parsed - it may not display correctly"))

            try:
                email_dict["body"] = cleaner.clean_html(email_dict["body"])
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

                plain_message = True
                messages.error(self.request, _("This email contained invalid HTML and could not be displayed"))

        self.headline = email_dict["subject"]

        # GET params for users with `ask_image` set in their profile
        if plain_message:
            # bypass image scrubber
            img_display = True
            ask_images = False
        elif "imgDisplay" in self.request.GET and int(self.request.GET["imgDisplay"]) == 1:
            img_display = True
            ask_images = False
        elif self.request.user.userprofile.flags.ask_images:
            img_display = False
            ask_images = True
        else:
            img_display = self.request.user.userprofile.flags.display_images
            ask_images = False

        # filter images if we need to
        if not img_display:
            try:
                tree = lxml_html.fromstring(email_dict["body"])
                for img in tree.findall(".//img"):
                    try:
                        del img.attrib["src"]
                    except KeyError:
                        pass
                email_dict["body"] = etree.tostring(tree)
            except (etree.LxmlError, ValueError):
                if plain is not None and len(plain.body.data) > 0:
                    email_dict["body"] = str(plain.body.data)
                    email_dict["charset"] = plain.charset
                else:
                    email_dict["body"] = ""
                    email_dict["charset"] = "utf-8"

        # convert to unicode as late as possible
        email_dict["body"] = unicode(email_dict["body"], email_dict["charset"], errors="replace")

        context = super(EmailView, self).get_context_data(**kwargs)
        context.update({
                        "email": email_dict,
                        "plain_message": plain_message,
                        "attachments": attachments,
                        "ask_images": ask_images,
                        })

        return context
Code Example #34
from lxml.html import fragments_fromstring
from PIL import Image, ImageDraw, ImageFont

from rs_mailer import EmailSender
import base64
import tempfile

import requests
from readability import Document
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['div', 'span']
cleaner.kill_tags = ['svg']

# This script will create an opf version of The Guardian (or The
# Observer on Sunday) suitable for turning into a .mobi file for
# copying to your Kindle.
blacklisted_section_names = ['pictures']

get_paper_articles = False

email_send = False

sleep_seconds_after_api_call = 2

# Check the path of the directory where this script is located
# to read keys and config files 
# (Ignore symbolic links)
Code Example #35
File: settings.py Project: LukasBBAW/trafilatura-1
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False  # True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False  # True
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = ['a', 'abbr', 'acronym', 'address', 'big', 'cite', 'dd', 'font', 'ins', 'meta', 'span', 'small', 'sub', 'sup', 'wbr'] #  'center', 'table', 'tbody', 'td', 'th', 'tr',
HTML_CLEANER.remove_tags = ['img']
HTML_CLEANER.kill_tags = ['aside', 'del']
# 'area', 'table' # 'header'

CUT_EMPTY_ELEMS = {
    'article', 'b', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'li',
    'main', 'p', 'section', 'span', 'strong', 'td'
}
# 'meta',

MANUALLY_CLEANED = [
    'audio', 'blink', 'button', 'canvas', 'embed', 'figure', 'footer', 'form',
    'head', 'iframe', 'input', 'link', 'map', 'marquee', 'math', 'nav',
    'noscript', 'object', 'picture', 'script', 'style', 'svg', 'time', 'video'
]
# 'frame' 'frameset' 'source', 'img',
Code Example #36
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())

    if len(links[3].cssselect("a")[0].attrib['href']) > len('http://'):

        web = links[3].cssselect("a")[0].attrib['href']

    else:

        web = ""

    return direccion, telefono, fax, web


cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1, 45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag=' + str(
        i)

    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")

    for link in links:

        record = {}

        name = link.cssselect("a")[0].text_content()
        card_link = link.cssselect("a")[0].attrib['href']
Code Example #37
MIN_FILE_SIZE = 10

# Plausible dates
# earliest possible year to take into account (inclusive)
MIN_YEAR = 1995
# latest possible date
LATEST_POSSIBLE = datetime.date.today()
# latest possible year
MAX_YEAR = datetime.date.today().year

# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.comments = False
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = False
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = True
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = True
HTML_CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]
# 'embed', 'figure', 'img', 'table'