Example no. 1
def load(request):
    with urlopen(
            'https://secure.toronto.ca/cc_sr_v1/data/swm_waste_wizard_APR?limit=1000'
    ) as response:
        data = json.loads(response.read().decode())

    cleaner = Cleaner()
    cleaner.remove_tags = ['span']

    item: dict
    for item in data:
        to_be_stored_body = html.unescape(item['body'])
        if '<ul' not in to_be_stored_body:
            to_be_stored_body = '<ul><li>' + to_be_stored_body + '</li></ul>'
        to_be_stored_body = cleaner.clean_html(to_be_stored_body)
        if not Item.objects.filter(
                body=to_be_stored_body).count():  # Only load if body is unique
            i = Item(body=to_be_stored_body,
                     category=item['category'],
                     title=item['title'],
                     keywords=item['keywords'])
            if 'id' in item:  # Some items have an ID, load them if needed
                i.opt_id = item['id']
            i.save()

    return HttpResponse("Loaded items from JSON. Current item count: " +
                        str(Item.objects.count()))
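A note on the remove_tags = ['span'] setting above: remove_tags unwraps the listed tags but keeps their text, so the list items lose the <span> markup without losing any wording. A minimal sketch of that behaviour, using an assumed fragment rather than the live API response:

# Sketch only: the input string is an invented stand-in for item['body'].
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.remove_tags = ['span']
print(cleaner.clean_html('<ul><li>Plastic <span>bottle</span></li></ul>'))
# -> roughly '<ul><li>Plastic bottle</li></ul>'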
Example no. 2
    def crawl(self):
        # count starts at first page
        crawling = True
        count = 0
        time.sleep(5)
        while crawling:
            searchterm = self.searchterm
            city = self.city
            prov = self.province
            # url = "http://ca.indeed.com/jobs?q="+searchterm+'&l='+city+"%2C+"+prov+'&start='+str(count)
            url = "http://ca.indeed.com/jobs?q={0}&l=+{1}+%2C{2}&start={3}".format(
                searchterm, city, prov, str(count))
            print(url, 'current URL')
            page = requests.get(url)
            tree = html.fromstring(page.text)
            # cleans html by removing <b></b> tags in the description
            # These tags caused a bug where the descriptions were fragmented on multiple rows
            cleaner = Cleaner()
            cleaner.remove_tags = ['b']
            tree = cleaner.clean_html(tree)
            jobtitles = tree.xpath('//h2[@class="jobtitle"]/a/text()')
            joblinks = tree.xpath('//h2[@class="jobtitle"]/a/@href')
            job_descriptions = tree.xpath('//span[@class="summary"]/text()')
            jobtitles = (job.lstrip() for job in jobtitles)
            joblinks = (job.lstrip() for job in joblinks)
            job_descriptions = (job for job in job_descriptions)
            Database.add_entry(zip(jobtitles, joblinks, job_descriptions))
            link_pages = tree.xpath('//div[@class="pagination"]/a/@href')
            print(link_pages, 'link_pages')
            # look for next button
            # if no longer present it means we have reached the last page
            next_button = tree.xpath(
                '//*[@id="resultsCol"]/div/a/span/span/text()')
            next_button_str = ''.join(next_button)
            print(next_button)

            if u'Next' in next_button_str:
                print('found next will continue scraping...')
            else:
                print('Hit last page, crawler will stop...')
                crawling = False

            for page in link_pages:
                # takes digits from end of url
                # takes last 6 characters, unlikely that the number would be any bigger
                p = page[-6:]
                digits_url = ''.join([d for d in p if d.isdigit()])
                try:
                    print(digits_url, 'digits url')
                    if int(digits_url) > count:
                        print(page, 'page')
                        count = int(digits_url)
                        print(count, 'count')
                    else:
                        print(
                            'You probably broke your conditional statement...')
                        print(digits_url, 'current count {}'.format(count))
                except ValueError:
                    # print("We're on the first page so no int in the page url")
                    print('This failed', digits_url)
Example no. 3
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
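The naming in this helper is easy to misread: the tags in accept_list are passed to remove_tags, so they are merely unwrapped and their text survives, while the tags in reject_list go to kill_tags and are dropped together with their content. A small sketch of that difference, with an assumed fragment:

# Sketch only: contrasts kill_tags (element and content removed) with
# remove_tags (tag unwrapped, text kept).
from lxml.html.clean import Cleaner

c = Cleaner()
c.kill_tags = ['table']
c.remove_tags = ['b']
print(c.clean_html('<div><b>keep me</b><table><tr><td>drop me</td></tr></table></div>'))
# -> roughly '<div>keep me</div>'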
Example no. 4
    def clearTag_old(self, text: str) -> str:
        import lxml
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        return lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.document_fromstring(text))).decode("utf-8")
Example no. 5
 def _get_cleaner(self, print_style, print_js, remove_tags):
     c = Cleaner()
     c.scripts = not print_js
     c.javascript = not print_js
     c.style = not print_style
     c.remove_tags = remove_tags
     c.page_structure = False
     return c
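A hypothetical call to this helper, purely for illustration (the exporter object, flag values and input string are assumptions, not part of the snippet):

# Sketch only: keep styles and scripts out of the output, unwrap <font> tags.
cleaner = exporter._get_cleaner(print_style=False, print_js=False, remove_tags=['font'])
cleaned = cleaner.clean_html(raw_html)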
Example no. 6
 def create_html_cleaner(self):
     cleaner = Cleaner()
     cleaner.javascript = True
     cleaner.style = True
     cleaner.remove_tags = [
         'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
         'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
     ]
     return cleaner
Example no. 8
    def cleaner_li(self):

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.meta = True
        cleaner.safe_attrs_only = True
        cleaner.remove_tags = ['i', 'span', 'b', 'li']
        cleaner.safe_attrs = ['href']

        return cleaner
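Besides remove_tags, this cleaner combines safe_attrs_only with safe_attrs = ['href'], which strips every attribute except href. A minimal sketch of the combined effect, with an assumed fragment:

# Sketch only: <span> is unwrapped, class/id/onclick are dropped, href survives.
from lxml.html.clean import Cleaner

c = Cleaner()
c.safe_attrs_only = True
c.safe_attrs = ['href']
c.remove_tags = ['span']
print(c.clean_html('<p class="x"><span id="y">See</span> <a href="/doc" onclick="track()">docs</a></p>'))
# -> roughly '<p>See <a href="/doc">docs</a></p>'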
Example no. 9
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0
        return data
Example no. 10
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
Example no. 12
    def clean(self):
        cleaner = Cleaner(page_structure=False)
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.allow_tags = []
        cleaner.remove_tags = ['p', 'div', 'a']
        self.name = (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
        self.price = (lxml.html.document_fromstring(cleaner.clean_html(self.price))).text_content()
        self.discountcode = (lxml.html.document_fromstring(cleaner.clean_html(self.discountcode))).text_content()
        self.categorycode = (lxml.html.document_fromstring(cleaner.clean_html(self.categorycode))).text_content()
        self.orderdate = (lxml.html.document_fromstring(cleaner.clean_html(self.orderdate))).text_content()
        self.selldate = (lxml.html.document_fromstring(cleaner.clean_html(self.selldate))).text_content()
        self.page = (lxml.html.document_fromstring(cleaner.clean_html(self.page))).text_content()
Example no. 13
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content()
        data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content()
        data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content()

        # 		data['username']=  cleaner.clean_html(data['username'])
        #               data['storename']= cleaner.clean_html(data['storename'])
        #              data['email']= cleaner.clean_html(data['email'])

        return data
Example no. 14
def filter_html(html):
    cleaner = Cleaner(javascript=True,
                      scripts=True,
                      style=True,
                      embedded=False,
                      remove_unknown_tags=True)
    cleaner.remove_tags = ['div', 'font', 'strong', 'u', 'em', 'b']
    html = cleaner.clean_html(html)
    tree = lxml.html.fromstring(html)
    links = tree.xpath('//a')

    for a in links:
        a.set('rel', 'nofollow')
        a.set('target', '_blank')

    html = lxml.html.tostring(tree)
    return html
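A hypothetical call to filter_html(), with an invented input fragment: the inline formatting tags are unwrapped and every surviving link is rewritten with rel="nofollow" and target="_blank".

# Sketch only; lxml.html.tostring() returns bytes, hence the b'...' output.
fragment = '<p><strong>Read</strong> the <a href="https://example.com">docs</a></p>'
print(filter_html(fragment))
# -> roughly b'<p>Read the <a href="https://example.com" rel="nofollow" target="_blank">docs</a></p>'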
Example no. 15
 def gettextonly(self, html, url):
     cleaner = Cleaner()
     cleaner.scripts = True
     cleaner.style = True
     cleaner.links = True
     cleaner.meta = False
     cleaner.page_structure = False
     cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'div', 'span', 'img', 'area', 'map', 'noscript', 'td', 'tr',
                    'table', 'a', 'p', 'br', 'li', 'ul']
     doc = lxml.html.fromstring(html)
     path = '/html/body'
     try:
         body = doc.xpath(path)[0]
     except Exception as detail:
         print(detail)
         return False
     return cleaner.clean_html(body).text_content().split()
Example no. 16
    def get_current_players(self, info_tree):
        table_head_pat = '//table[@id="players"]//thead//tr//th'
        # Some but not all headers have an <a> for sorting columns
        # that needs to be removed
        cleaner = Cleaner()
        cleaner.remove_tags = ['a']

        headings = []
        required_headings = {
            'Name', 'Perk', 'Dosh', 'Health', 'Kills', 'Ping', 'Admin'
        }
        for heading in info_tree.xpath(table_head_pat):
            heading = cleaner.clean_html(heading)
            headings += heading.xpath('//th/text()')

        if not required_headings.issubset(set(headings)):
            logger.error("Player is missing columns ({}) on {}".format(
                required_headings - set(headings), self.server.name))

        player_rows_pat = '//table[@id="players"]//tbody//tr'
        player_rows_tree = info_tree.xpath(player_rows_pat)

        players_table = []

        for player_row in player_rows_tree:
            values = []
            for value in player_row:
                if not value.text_content():
                    values += [None]
                else:
                    values += [value.text_content()]

            if values[0] == "There are no players":
                logger.debug("No players on server {}".format(
                    self.server.name))
            elif len(values) != len(headings):
                logger.warning("Player row ({}) length did not "
                               "match the table length on {}".format(
                                   player_row[headings.index("Name")],
                                   self.server.name))
            else:
                players_table += [values]

        return (headings, players_table)
Example no. 17
    def clean(self: T) -> str:
        cleaner = Cleaner()
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # allow_tags and remove_unknown_tags can't work together
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags
        if self.__kill_tags is not None: cleaner.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            cleaner.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            cleaner.safe_attrs = self.__safe_attrs

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
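The comment in this builder is worth spelling out: allow_tags only takes effect when remove_unknown_tags is False, because lxml's Cleaner refuses to combine the two options. A minimal sketch, independent of this class, with an assumed fragment:

# Sketch only: everything outside allow_tags is unwrapped, its text kept.
from lxml.html.clean import Cleaner

c = Cleaner()
c.remove_unknown_tags = False
c.allow_tags = ['p', 'a']
print(c.clean_html('<p>Keep <a href="/x">this</a>, <span>unwrap that</span></p>'))
# -> roughly '<p>Keep <a href="/x">this</a>, unwrap that</p>'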
Example no. 18
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
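Unlike most of the other examples, this one applies the cleaner by calling it directly on the parsed document (cleaner(lxmldoc)), which cleans the tree in place; clean_html instead returns the cleaned result and also accepts plain strings. A small sketch of that difference, with assumed fragments:

# Sketch only: calling the Cleaner mutates the tree, clean_html returns a value.
import lxml.html
from lxml.html.clean import Cleaner

c = Cleaner()
c.remove_tags = ['b']

doc = lxml.html.fromstring('<div><b>in place</b></div>')
c(doc)  # the <b> tag is unwrapped inside doc itself
print(lxml.html.tostring(doc))  # -> roughly b'<div>in place</div>'

print(c.clean_html('<div><b>as a string</b></div>'))  # -> roughly '<div>as a string</div>'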
Example no. 19
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]

        # (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()
        data["price"] = (lxml.html.document_fromstring(cleaner.clean_html(data["price"]))).text_content()
        data["itemid"] = (lxml.html.document_fromstring(cleaner.clean_html(data["itemid"]))).text_content()
        data["discountcode"] = (lxml.html.document_fromstring(cleaner.clean_html(data["discountcode"]))).text_content()
        data["orderdate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["orderdate"]))).text_content()
        data["selldate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["selldate"]))).text_content()
        data["page"] = (lxml.html.document_fromstring(cleaner.clean_html(data["page"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0

        #           self.name= cleaner.clean_html(self.name)
        return data
Example no. 20
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  #  'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])

# counters
tokens_posts = 0
tokens_comments = 0
Example no. 21
from functools import wraps
from flask import Flask, request, session, render_template, url_for
from flask import abort, redirect, Markup, make_response
from flask_common import Common
from names import get_full_name
from raven.contrib.flask import Sentry
from flask_qrcode import QRcode
from . import storage
from urllib.parse import quote
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['script', 'style', 'link']
cleaner.allow_attributes = ['alt', 'href']
cleaner.remove_attributes = [
    'id', 'class', 'style', 'align', 'border', 'cellpadding', 'cellspacing',
    'width', 'height', 'hspace', 'vspace', 'frameborder', 'marginwidth',
    'marginheight', 'noresize', 'scrolling', 'target', 'onclick', 'ondblclick',
    'onmousedown', 'onmousemove', 'onmouseover', 'onmouseout', 'onmouseup',
    'onkeypress', 'onkeydown', 'onkeyup', 'onblur', 'onchange', 'onfocus',
    'onselect', 'onreset', 'onsubmit', 'onabort', 'oncanplay',
    'oncanplaythrough', 'oncuechange', 'ondurationchange', 'onemptied',
    'onended', 'onloadeddata', 'onloadedmetadata', 'onloadstart', 'onpause',
    'onplay', 'onplaying', 'onprogress', 'onratechange', 'onseeked',
    'onseeking', 'onstalled', 'onsuspend', 'ontimeupdate', 'onvolumechange',
    'onwaiting'
]
Example no. 22
    stopwords.words('english')
]
exclude_list = [item for sublist in exclude for item in sublist]
exclude_list.append('')

word_list = []
with open(args.infile, 'rb') as infile:
    soup = bs(infile)
    if len(soup) > 0:
        if soup.find('title') is not None:
            title = soup.find('title').contents[0]
            print title
        body = soup.findAll('p')
        print body
        cleaner = Cleaner()
        cleaner.remove_tags = ['p']
        for x in body:
            document = lxml.html.document_fromstring(str(x))
            word_list.append(document.text_content())

        word_list = [re.sub("\\n", '', word) for word in word_list]
        word_list = [word.split(' ') for word in word_list]
        word_list = [item for sublist in word_list for item in sublist]

        wordslist2 = []
        for word in word_list:
            try:
                word = word.translate(None,
                                      string.punctuation.translate(
                                          None, '"')).lower()
                wordslist2.append(word)
Example no. 23
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False  # True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False  # True
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = ['a', 'abbr', 'acronym', 'address', 'big', 'cite', 'dd', 'font', 'ins', 'meta', 'span', 'small', 'sub', 'sup', 'wbr'] #  'center', 'table', 'tbody', 'td', 'th', 'tr',
HTML_CLEANER.remove_tags = ['img']
HTML_CLEANER.kill_tags = ['aside', 'del']
# 'area', 'table' # 'header'

CUT_EMPTY_ELEMS = {
    'article', 'b', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'li',
    'main', 'p', 'section', 'span', 'strong', 'td'
}
# 'meta',

MANUALLY_CLEANED = [
    'audio', 'blink', 'button', 'canvas', 'embed', 'figure', 'footer', 'form',
    'head', 'iframe', 'input', 'link', 'map', 'marquee', 'math', 'nav',
    'noscript', 'object', 'picture', 'script', 'style', 'svg', 'time', 'video'
]
# 'frame' 'frameset' 'source', 'img',
Example no. 24
def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  #  activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    #invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except:
        #error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    #	page8 = re.sub(u'\s','',page8,re.UNICODE) # blanks -> space
    page8 = re.sub(u'&#13;', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]',
                   ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)  # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><',
                       page8)  # remove words under 10 chars between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)

    page8 = re.sub(u'\s+', ' ', page8)  # remove repeated blanks

    #XPATHs
    xpath = '//*[((p) or (a) or (b) or (div) or (span)) ]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'

    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()
    content = u""
    if text:
        for s in text:
            # squash duplicate whitespaces
            s = ' '.join(s.split())
            # remove short lines
            # on empirical analysis, lines under 40 chars are rarely relevant article text, mostly repetition of title, authors, dates, etc.
            if len(s) < 40:
                continue
    # remove leading whitespace
    #if s.endswith(" "): s = s[:-1]
            if s.startswith(" "): s = s[1:]
            content += s
            content += "\n"
    return content
Example no. 25
                                             "articles of unnecessary stuff.")
parser.add_argument("-f","--file", help="metalink article name",required=True)
args = parser.parse_args()
filename = args.file

# file check
if not os.path.isfile(filename) and not os.access(filename, os.R_OK):
    print "WARNING - Couldn't find specified file!"
    sys.exit(1)
elif not os.path.exists('original'):
    print 'Creating original directory for backups...'
    os.makedirs('original')

# cleaner
cleaner = Cleaner(page_structure=False)
cleaner.remove_tags = ["span"]
cleaner.kill_tags = ["script","img","style"]


# original file conversion
original = codecs.open(filename,"r","cp866")
for line in original:
    line = re.sub(r"[^\x00-\x7F]+","",line)
    #if "&nbsp;" in line:
        #line = re.sub(r"&nbsp;", "", line)
    if "&reg;" in line:
        line = line.replace("&reg;","")
    number = re.search(r"<span style=\"display:none\">\d+</span>", line)
    if number:
        line = re.sub(r"<span style=\"display:none\">\d+</span>", "", line)
    footer = re.search(r"Didn't find what you are looking for\?", line)
Example no. 26
import re
import os
import lxml
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError
from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set([])
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2',
    'h3', 'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
    def get_text(web_source):
        try:
            _html = lxml.html.document_fromstring(web_source)
        except XMLSyntaxError:
Example no. 27
    def parse_html(self, filename):
        print(filename)
        parser = etree.HTMLParser()
        try:
            tree = html.parse(filename)
        except:
            return False, ''

        content = {}
        title = tree.xpath("//meta[@name='keywords']/@content")[0].replace(
            '- TAAZE 讀冊生活', '')
        content['title'] = title

        prod_info = tree.xpath("//li//span")
        for p in prod_info:
            #print("p={}".format(p.text))
            if p.text != None and '作者' in p.text:
                for info in p.iter('a'):
                    author = info.text.replace(
                        '/著', '').replace('/編著', '').replace('/編', '').replace(
                            '/撰文', '').replace('/總編輯', '').replace('/繪', '')
                    author = author.replace('/譯', '').replace(
                        '/小說改編',
                        '').replace('/原著劇本',
                                    '').replace('/資料提供',
                                                '').replace('/企劃主編', '')
                    author = author.replace('/改編', '').replace(
                        '/原著', '').replace('/口述',
                                           '').replace('/作', '').replace(
                                               '/繪,文', '').replace(' ', '')
                    content['author'] = author
            elif p.text != None and '譯者' in p.text:
                for info in p.iter('a'):
                    content['translator'] = info.text
            elif p.text != None and '出版社' in p.text:
                for info in p.iter('a'):
                    content['publisher'] = info.text
            elif p.text != None and '出版日期' in p.text:
                for info in p.iter('span'):
                    content['publish_date'] = info.text
            elif p.text != None and 'ISBN' in p.text:
                for info in p.iter('span'):
                    content['ISBN_no'] = info.text
            elif p.text != None and '類別' in p.text:
                content['genre'] = []
                for info in tree.xpath('//li//span/following-sibling::span/a'):
                    if info.attrib.get('class') == 'linkStyle02':
                        #print("info={}, {}".format(info.getparent().tag, info.text))
                        content['genre'].append(info.text)

        tag_info = tree.xpath("//a[@class='tag']")
        content['tag'] = []
        for p in tag_info:
            #print("p={}".format(p.text))
            content['tag'].append(p.text)

        brief_info = tree.xpath("//div[@id='prodPfDiv']")
        desc = ''
        ad_word = ['關鍵特色', '好評推薦', '作者簡介', '佳評如潮', '暢銷書', '本書特色']
        if len(brief_info):
            for child in brief_info[0]:
                if child.text != None and '作者簡介' in child.text:
                    break

                cleaner = Cleaner()
                cleaner.remove_tags = [
                    'p', 'br', 'span', 'font', 'b', 'center', 'u', 'strong'
                ]
                innertext = etree.tostring(child,
                                           encoding='unicode',
                                           method='html').replace("<div>",
                                                                  "").replace(
                                                                      "</div>",
                                                                      "")

                cleaned = cleaner.clean_html(innertext)
                if len(cleaned):
                    cleaned = cleaned.replace("<div>",
                                              "").replace("</div>", "")

                ad_exist = False
                for word in ad_word:
                    if word in cleaned:
                        ad_exist = True

                if ad_exist == True:
                    break

                desc += cleaned
        content['description'] = desc

        head, tail = os.path.split(filename)
        tazze_link = 'http://www.taaze.tw/sing.html?pid=' + tail[:-5]
        content['link'] = [tazze_link]

        #print("content={}".format(content))

        if 'ISBN_no' not in content.keys():
            content['ISBN_no'] = tail[:-5]
        filename = "{}.json".format(content['ISBN_no'])
        file_path = os.path.join(self.json_folder, filename)
        output = open(file_path, "w")
        output.write(json.dumps(content, ensure_ascii=False))
        output.close()

        return True, content['ISBN_no']
Example no. 28
import codecs
import os
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

reload(sys)
sys.setdefaultencoding('utf8')
cleaner = Cleaner()
cleaner.scripts = True  # strip <script> elements
cleaner.style = True
cleaner.kill_tags = ['a', 'img', 'href']
cleaner.remove_tags = ['div', 'span', 'li']

directory1 = "C:\Users\Satanu\html_test\\"
directory2 = "C:\Users\Satanu\text\\"
for filename in os.listdir(directory1):
    to_write = []
    html = codecs.open(directory1 + filename, 'r', 'utf-8')
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(directory1 + filename)))
    name = filename.strip('html')

    text = codecs.open(directory2 + filename, 'w', 'utf-8')

    text.write(raw)

    soup = BeautifulSoup(raw, 'html')
Example no. 29
    def parse_books_html(self, filename):
        parser = etree.HTMLParser()
        try:
            tree = html.parse(filename)
        except:
            return False, '', False

        try:
            content = {}
            title = tree.xpath("//title")[0]
            if title == None:
                return False, '', False

            content['title'] = title.text.replace('博客來-', '')

            property_info = tree.xpath(
                "//meta[@name='description']")[0].attrib.get('content')
            property_list = property_info.split(",")
            for item in property_list:
                if 'ISBN' in item:
                    content['ISBN_no'] = item[5:]
                elif '出版社' in item:
                    content['publisher'] = item[4:]
                elif '作者' in item:
                    content['author'] = item[3:]
                elif '譯者' in item:
                    content['translator'] = item[3:]
                elif '出版日期' in item:
                    content['publish_date'] = item[5:].replace('/', '-')

            genre_info = tree.xpath(
                "//div[@class='mod_b type02_m058 clearfix']//ul[@class='sort']"
            )
            for p in genre_info:
                content['genre'] = []
                for item in p.iter('a'):
                    content['genre'].append(item.text)

            brief_info = tree.xpath("//div[@itemprop='description']")
            desc = ''
            ad_word = ['關鍵特色', '好評推薦', '作者簡介', '佳評如潮', '暢銷書', '本書特色']
            if len(brief_info):
                for child in brief_info[0]:
                    if child.text != None and '作者簡介' in child.text:
                        break
                    cleaner = Cleaner()
                    cleaner.remove_tags = [
                        'p', 'br', 'span', 'font', 'b', 'center', 'u', 'strong'
                    ]
                    innertext = etree.tostring(
                        child, encoding='unicode',
                        method='html').replace("<div>", "").replace(
                            "</div>",
                            "").replace("\u3000",
                                        '').replace('\n',
                                                    '').replace('\r', '')

                    cleaned = cleaner.clean_html(innertext)
                    if len(cleaned):
                        cleaned = cleaned.replace("<div>",
                                                  "").replace("</div>", "")

                    desc += cleaned
            content['description'] = desc

            head, tail = os.path.split(filename)
            loc_idx = tail.find('loc=')
            pid = tail[:loc_idx - 1]
            content['link'] = ['http://www.books.com.tw/products/' + tail[:-5]]
            if 'ISBN_no' not in content.keys():
                content['ISBN_no'] = pid
            #download image
            img_link = tree.xpath(
                "//meta[@property='og:image']")[0].attrib.get('content')
            r = requests.get(img_link, headers=self.header)
            image_status = False
            if r.status_code == 200:
                filename = '{}.jpg'.format(content['ISBN_no'])
                output_file = os.path.join(self.books_img_folder, filename)
                if not os.path.exists(output_file):
                    output = open(output_file, "wb")
                    output.write(r.content)
                    output.close()
                image_status = True

            filename = "{}.json".format(content['ISBN_no'])
            file_path = os.path.join(self.books_json_folder, filename)
            output = open(file_path, "w")
            output.write(json.dumps(content, ensure_ascii=False))
            output.close()

            return True, content['ISBN_no'], image_status
        except Exception as e:
            print(filename)
            print(traceback.format_exc())
            return False, '', False
Example no. 30
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    if include_tables is False:
        MANUALLY_CLEANED.append('table')
    if include_images is False:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        MANUALLY_CLEANED.extend(['figure', 'picture', 'source'])
        MANUALLY_STRIPPED.append('img')
    for expression in MANUALLY_CLEANED:
        for element in tree.getiterator(expression):
            try:
                element.drop_tree()
Example no. 31
                        stopwords.words('english')]
exclude_list = [item for sublist in exclude for item in sublist]
exclude_list.append('')


for iff in infiles:
    wordslist = []
    with open("all_data/" + iff, 'rb') as temp:
        soup = bs(temp)
        if len(soup) > 0:
            if soup.find('title') is not None:
                title = soup.find('title').contents[0]

            body = soup.findAll('p')
            cleaner = Cleaner()
            cleaner.remove_tags = ['p']
            for x in body:
                document = lxml.html.document_fromstring(str(x))
                wordslist.append(document.text_content())

            wordslist = [re.sub("\\n",'',word) for word in wordslist]
            wordslist = [word.split(' ') for word in wordslist]
            wordslist = [item for sublist in wordslist for item in sublist]
            wordslist2 = []
            for word in wordslist:
                try:
                    word = word.translate(None, string.punctuation.translate(None, '"')).lower()
                    wordslist2.append(word)
                except TypeError:
                    pass
            wordslist = [word for word in wordslist2 if word not in set(exclude_list)]
Example no. 32
def google_news_cut(link):
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter

    page = get_web_page(link)
    soup = BeautifulSoup(page, 'html.parser')
    # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew')
    all_news = soup.find_all('a', 'ipQwMb Q7tWef')
    key_str = ""
    titles_link = []
    word_t_list = []
    documents = []
    for news in all_news:
        # print(news.string)
        # print(news['href'])
        if re.match('\./', news['href']) is None:
            link = news['href']
        else:
            link = 'https://news.google.com/' + re.sub('\./', "", news['href'])
        titles_link.append({'title': news.string, 'link': link})
        key_str = key_str + news.string + "\n"

    remove_words = [
        'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己', '中時', '年月日',
        '直播', '三立', '聞網', '使用者', '中國時報', '自由時報', '關鍵字', '網站', '發表', '留言', '發言',
        '網小時', '自由'
    ]

    jieba.load_userdict("my_dict.txt")
    jieba.load_userdict("news_dict.txt")
    jieba.analyse.set_stop_words("stop_words.txt")
    jieba.analyse.set_stop_words("stop_words_sport.txt")

    for t_link in titles_link:

        print('get_web_page: ', t_link['title'], " ", t_link['link'])
        try:
            page = get_web_page_html(t_link['link'])
            # page = get_web_page(t_link['link'])
        except requests.exceptions.SSLError:
            continue
        except lxml.etree.ParserError:
            continue
        if page is None:
            continue
        cleaner.kill_tags = ['a', 'img']
        cleaner.remove_tags = ['div', 'p']
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = ['p']
        result = html.tostring(cleaner.clean_html(page),
                               encoding="utf-8",
                               pretty_print=True,
                               method="html")
        article_content = re.sub('&#13;', "", result.decode('utf-8'))

        #
        article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content)
        article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content)
        article_content = re.sub('自己', "", article_content)
        # print(article_content)
        words_t = jieba.cut(article_content, cut_all=False)
        word_t_list = [word for word in words_t if word not in remove_words]
        print(word_t_list)
        documents.append(word_t_list)
    return documents
Example no. 33
import urllib2
import re
import lxml.html
from lxml.html.clean import Cleaner

def separatewords(text):
    splitter = re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if s != '']


cleaner = Cleaner()
cleaner.scripts = True
cleaner.style = True
cleaner.links = True
cleaner.meta = False
cleaner.page_structure = False
cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                       'div', 'span', 'img', 'area', 'map', 'noscript', 'td', 'tr',
                       'table', 'a', 'p', 'br', 'li', 'ul']

url = 'http://www.news.mail.ru/'
c = urllib2.urlopen(url)
html = c.read()


doc = lxml.html.fromstring(html)

path = '/html/body'
body = doc.xpath(path)[0]
words =  cleaner.clean_html(body).text_content()
print words
print words.split()
print [s for s in words.split()]