Example #1
    def test_doc_no_scripts_styles(self):
        """Step #1: remove all scripts from the document."""
        doc = Article(load_snippet('document_scripts.html'))
        readable = doc.readable_dom
        self.assertEqual(readable.findall(".//script"), [])
        self.assertEqual(readable.findall(".//style"), [])
        self.assertEqual(readable.findall(".//link"), [])
Example #2
def getData(source_url):
    if not source_url.startswith('//') and '://' not in source_url:
        source_url = 'http://' + source_url

    try:
        html = get(source_url,
                   headers={
                       'User-Agent': 'Computer Club Plaintext Reading Plugin'
                   }).text
    except Exception:
        return None, None, None

    soup = BeautifulSoup(html, 'lxml')

    try:
        header = soup.find('h1').text.strip()
    except AttributeError:
        # soup.find('h1') returned None
        header = None

    try:
        title = soup.find('title').text.strip()
    except AttributeError:
        # soup.find('title') returned None
        title = None

    readable = Article(html, url=source_url).readable
    soup = BeautifulSoup(readable, 'lxml')

    if not soup.find('div', {'id': 'readabilityBody'}).text.strip():
        readable = None

    return title, header, readable
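The excerpt above leaves its imports implicit. A minimal sketch of the header it would need, assuming the requests and beautifulsoup4 packages and breadability's Article (the exact import path is an assumption, not from the source):

# Assumed imports for getData() above -- paths are best guesses.
from requests import get
from bs4 import BeautifulSoup
from breadability.readable import Article

# Hypothetical call: returns (title, header, readable),
# or (None, None, None) when the page cannot be fetched.
title, header, readable = getData('example.com/some-article')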
Example #3
def test_no_content():
    """Without content we supply an empty unparsed doc."""
    doc = Article('')

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'
    assert doc.readable_dom.get('class') == 'parsing-error'
Example #4
    def __init__(self, html_content, tokenizer, url=None):
        super(HtmlParser, self).__init__(tokenizer)
        self._article = Article(html_content, url)

        # Target number of paragraphs to keep.
        self.desired_ct = 2
        self.paragraph_ct = 0
        self.correct_paragraph_ct = 0

        # Count the paragraphs on the page.
        for paragraph in self._article.main_text:
            self.paragraph_ct += 1

        # With many paragraphs, the intro material is usually in the first
        # couple, so keep only a fraction of them.
        if self.paragraph_ct > self.desired_ct:
            # Keep roughly 3% of the paragraphs (count * 0.3 // 10)...
            self.correct_paragraph_ct = int(self.paragraph_ct * 0.3 // 10)
            # ...but if that leaves too few, fall back to the desired number.
            if self.correct_paragraph_ct <= 1:
                self.correct_paragraph_ct = self.desired_ct
        # Fewer than desired but more than zero: already at the right count.
        elif self.paragraph_ct > 0:
            self.correct_paragraph_ct = self.paragraph_ct

        # Delete the excess paragraphs from the end.
        diff = self.paragraph_ct - self.correct_paragraph_ct
        while diff > 0:
            del self._article.main_text[diff + self.correct_paragraph_ct - 1]
            diff -= 1
Example #5
    def test_candidates(self):
        """Verify we have candidates."""
        doc = Article(self.article)
        # from lxml.etree import tounicode
        found = False
        wanted_hash = '04e46055'
        # from breadability.logconfig import LNODE
        # from breadability.logconfig import set_logging_level
        # set_logging_level('DEBUG')
        # LNODE.activate()
        for node in doc.candidates.values():
            if node.hash_id == wanted_hash:
                found = node

        self.assertTrue(found)

        # We found the right node. If it's gone by the time we need it,
        # something must have dropped it, so make sure it's not in the
        # to-drop list.
        for node in doc._should_drop:
            self.assertFalse(node == found.node)

        by_score = sorted(doc.candidates.values(),
                          key=attrgetter('content_score'),
                          reverse=True)
        self.assertTrue(by_score[0].node == found.node)

        updated_winner = check_siblings(by_score[0], doc.candidates)
        updated_winner.node = prep_article(updated_winner.node)
Example #6
    def test_content_exists(self):
        """Verify that some content exists."""
        doc = Article(self.article)
        self.assertTrue('Amazon and Google' in doc.readable)
        self.assertFalse('Linkblog updated' in doc.readable)
        self.assertFalse(
            '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)
Example #7
def test_unlikely_hits():
    """Verify we wipe out things from our unlikely list."""
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc.readable_dom
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot', 'header', 'menu',
        'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
        'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter',
        'imgBlogpostPermalink'
    ]

    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for i in must_not_appear:
        # we cannot find any class or id with this value
        by_class = readable.find_class(i)

        for test in by_class:
            # if the element is still here, its must-not class has to be
            # paired with one of the want-to-appear classes
            found = False
            for cls in test.get('class').split():
                if cls in want_to_appear:
                    found = True
            assert found

        by_ids = readable.get_element_by_id(i, False)
        if by_ids is not False:
            found = False
            for ids in by_ids.get('id').split():
                if ids in want_to_appear:
                    found = True
            assert found
Example #8
def test_doc_no_scripts_styles():
    """Step #1 remove all scripts from the document"""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom

    assert readable.findall(".//script") == []
    assert readable.findall(".//style") == []
    assert readable.findall(".//link") == []
Example #9
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path, "rb") as file:
        return Article(
            file.read(),
            "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8"
        )
Example #10
    def test_beta_removed(self):
        """The id=beta element should be removed

        It's link heavy and causing a lot of garbage content. This should be
        removed.

        """
        doc = Article(self.article)
        self.assertTrue('id="beta"' not in doc.readable)
Example #11
    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.

        We build our doc around the rest of the html we parsed.

        """
        doc = Article(load_snippet('document_no_body.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
Example #12
    def test_images_preserved(self):
        """Images in the article content should be preserved."""
        doc = Article(self.article)
        self.assertTrue(
            'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg'
            in doc.readable)
        self.assertTrue(
            'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg'
            in doc.readable)
Example #13
    def test_bare_content(self):
        """If the document is just pure content, no html tags we should be ok

        We build our doc around the rest of the html we parsed.

        """
        doc = Article(load_snippet('document_only_content.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
Example #14
    def test_find_body_exists(self):
        """If the document has a body, we store that as the readable html

        No sense processing anything other than the body content.

        """
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable_dom.tag, 'div')
        self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
Example #15
def test_one_annotation():
    article = Article(
        "<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(
        ("This is text\nwith", None),
        ("no", ("del", )),
        ("annotations", None),
    )]
Example #16
def extract_html(content):
    article = Article(content)
    annotated_text = article.main_text
    paragraphs = ""
    for paragraph in annotated_text:
        sentences = ""
        for text, annotations in paragraph:
            sentences += text
        paragraphs += sentences
    return paragraphs
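A hedged usage sketch for extract_html() above; the input HTML is a made-up illustration:

# Hypothetical call: flatten a document down to its main text.
# Note that paragraphs are concatenated with no separator between them.
text = extract_html("<div><p>First paragraph.</p><p>Second one.</p></div>")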
Example #17
    def test_one_annotation(self):
        article = Article(
            "<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
        annotated_text = article.main_text

        expected = [(
            ("This is text\nwith", None),
            ("no", ("del", )),
            ("annotations", None),
        )]
        self.assertEqual(annotated_text, expected)
Example #18
    def parse(content, content_type=None, url=None):
        """Handle the parsing out of the html content given"""
        read = Readable()
        document = Article(content.read(), url=url)

        if not document.readable:
            read.error(STATUS_CODES['900'], "Could not parse content.")
        else:
            read.set_content(document.readable, content_type=content_type)
            read.status = STATUS_CODES['1']
        return read
Example #19
def createAudioFile(filename: str, url: str, speed: int = 200):

    # check that the url is valid and fetch the page
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return "Unable to get page " + url
    except Exception:
        return "Unable to get page " + url

    # fix filename extension if needed for pyttsx3
    if len(filename) < 5:
        filename = filename + ".mp3"
    elif filename[-4:] != ".mp3":
        filename = filename + ".mp3"

    # check that the filename/location is writable
    # This is really not the pythonic solution, HOWEVER
    # pyttsx3 seems to provide NO error checking or anything
    directory = os.path.abspath(os.path.dirname(filename))
    if not os.access(directory, os.W_OK):
        return "Unable to write to directory/file"

    # check for existence
    if os.path.exists(filename):
        return "File already exists"

    # find the important content with readability
    doc = Article(response.content, url=url)

    # isolate the text with soup
    soup = BeautifulSoup(doc.readable, 'html.parser')
    text = soup.text

    # init speech engine
    engine = pyttsx3.init()
    # set the speaking rate
    engine.setProperty('rate', speed)
    # queue the write to the file location
    engine.save_to_file(text, filename)
    # run the queued command
    engine.runAndWait()
    engine.stop()

    # return to confirm completion
    return "Wrote audio to " + filename
Example #20
def processHtml(html):
    """
    Process an HTML document into plain text.
    :param html: HTML source to extract the main text from
    :return: the extracted text as a single string
    """
    _article = Article(html)
    annotated_text = _article.main_text
    sentences = []
    for paragraph in annotated_text:
        current_text = ""
        for text, annotations in paragraph:
            current_text += " " + text
        sentences.append(current_text)
    return "".join(sentences)
Example #21
def extract_html(content, is_content=True):
    if is_content:
        article = Article(content)
        annotated_text = article.main_text
    else:
        annotated_text = [((content, None),)]
    paragraphs = ""
    split_sent = ['。', '?', '!', '!', '?']

    # Do not split sentences that sit inside double quotation marks.
    Dquotes = ['"', '“', '”']
    for paragraph in annotated_text:
        sentences = ""
        for text, annotations in paragraph:
            sentences += text
        # Strip bracketed asides, translation credits, and editor bylines.
        sentences = re.sub(r"((.*?))?(\(.+\))?(编译.+)?(责编:.+)?", "", sentences)
        sentences = re.sub(r"(本文系版权作品,未经授权严禁转载。.*)\s?(责编)?", "", sentences)
        # Mask digits so they become a uniform token.
        sentences = re.sub(r"\d", "#", sentences)
        sentences = " ".join(jieba.cut(sentences))
        if len(sentences) == 0:
            continue
        quote = False
        newsentences = ""
        newsentences += " " + PARAGRAPH_START + " " + SENTENCE_START + " "
        for word in sentences:
            if word in Dquotes and not quote:
                quote = True
                newsentences += word
            elif word in Dquotes and quote:
                quote = False
                newsentences += word
            elif quote:
                newsentences += word
            elif word in split_sent and not quote:
                newsentences += word
                newsentences += " " + SENTENCE_END + " "
                newsentences += SENTENCE_START + " "
            else:
                newsentences += word
        # If the paragraph ended right after a sentence-start marker, drop the
        # dangling marker; otherwise close the final sentence.
        if len(newsentences) - newsentences.rfind(SENTENCE_START + " ") == 4:
            newsentences = newsentences[:-len(SENTENCE_START + " ")]
        else:
            newsentences += " " + SENTENCE_END
        newsentences += " " + PARAGRAPH_END
        paragraphs += newsentences
    return paragraphs
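The jieba import and the paragraph/sentence marker constants sit outside this excerpt. A sketch of the assumed module-level setup (the marker values are hypothetical; the length check above only implies that SENTENCE_START is three characters wide):

# Assumed setup for extract_html() above -- marker values are illustrative only.
import re
import jieba
from breadability.readable import Article

PARAGRAPH_START = '<p>'
PARAGRAPH_END = '</p>'
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'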
Example #22
def test_simple_snippet():
    snippet = Article(load_snippet("annotated_1.html"))
    annotated_text = snippet.main_text

    assert annotated_text == [
        (
            ("Paragraph is more", None),
            ("better", ("em",)),
            (".\nThis text is very", None),
            ("pretty", ("strong",)),
            ("'cause she's girl.", None),
        ),
        (
            ("This is not", None),
            ("crap", ("big",)),
            ("so", None),
            ("readability", ("dfn",)),
            ("me :)", None),
        ),
    ]
Example #23
def test_real_article():
    article = Article(load_article("zdrojak_automaticke_zabezpeceni.html"))
    annotated_text = article.main_text

    assert annotated_text == [
        (
            ("Automatické zabezpečení", ("h1", )),
            ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None),
        ),
        (
            ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.",
             ("li", "ol")),
            ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.",
             ("li", "ol")),
            ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.",
             ("li", "ol")),
        ),
        (("Jak se tyto úrovně projevují v jednotlivých oblastech?", None), ),
        (
            ("XSS", ("a", "h2")),
            ("Druhou úroveň představuje ruční ošetřování pomocí", None),
            ("htmlspecialchars", ("a", "kbd")),
            (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v",
             None),
            ("Nette Latte", ("a", "strong")),
            (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí",
             None),
            ("{!$var}", ("code", )),
            (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní",
             None),
            ("{$var}", ("code", )),
            ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.",
             None),
        ),
        (("<?php\n$safeHtml = $texy->process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>",
          ("pre", )), ),
        (
            ("Ideální by bylo, když by už samotná metoda", None),
            ("process()", ("code", )),
            ("vracela instanci", None),
            ("Html", ("code", )),
            (".", None),
        ),
    ]
Example #24
def extract(html, **kwargs):
    """ Extract an article from given URL

    Example::

        >>> from artexin.fetch import fetch_content
        >>> c = fetch_content('http://hetland.org/writing/instant-hacking.html')
        >>> t, s = extract(c)
        >>> 'What is Programming?' in s
        True
        >>> '<a href="./../research">Research</a>' in s
        False
        >>> '<div id="navigation">' in s
        False

    :param html:        String containing the HTML document
    :param **kwargs:    Extra arguments for the ``Article()`` class
    :returns:           Two-tuple containing document title and article body
    """
    # Extract article
    soup = BeautifulSoup(html, 'lxml')
    title_text = get_title(soup)

    doc = Article(html, return_fragment=False, **kwargs)

    # Create basic <head> tag with <title> and charset tags
    clean_html = doc.readable
    soup = BeautifulSoup(clean_html, 'lxml')
    head = soup.new_tag('head')
    title = soup.new_tag('title')
    title.string = title_text
    meta_charset = soup.new_tag('meta', charset='utf-8')
    meta_equiv = soup.new_tag('meta', content="text/html; charset='utf-8'")
    meta_equiv['http-equiv'] = 'Content-Type'  # hyphenated attrs can't be kwargs
    soup.html.insert(0, head)
    soup.head.append(meta_charset)
    soup.head.append(meta_equiv)
    soup.head.append(title)

    # Add doctype
    final = '<!DOCTYPE html>\n' + soup.prettify()
    return (title_text, final)
Example #25
def main():
    args = parse_args()

    if args.verbose:
        set_logging_level('DEBUG')

    if args.debug:
        LNODE.activate()

    target = args.path[0]
    LOG.debug("Target: " + target)

    if target.startswith('http') or target.startswith('www'):
        is_url = True
        url = target
    else:
        is_url = False
        url = None

    if is_url:
        req = urllib.urlopen(target)
        content = req.read()
        ucontent = unicode(content, 'utf-8')
    else:
        ucontent = codecs.open(target, "r", "utf-8").read()

    doc = Article(ucontent, url=url, fragment=args.fragment)
    if args.browser:
        fg, pathname = mkstemp(suffix='.html')
        out = codecs.open(pathname, 'w', 'utf-8')
        out.write(doc.readable)
        out.close()
        webbrowser.open(pathname)
    else:
        # Wrap sys.stdout into a StreamWriter to allow writing unicode.
        sys.stdout = codecs.getwriter(
                        locale.getpreferredencoding())(sys.stdout)
        sys.stdout.write(doc.readable)
Example #26
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path) as file:
        return Article(file.read())
Example #27
    def __init__(self, html_content, tokenizer, url=None):
        super(HtmlParser, self).__init__(tokenizer)
        self._article = Article(html_content, url)
Example #28
def test_title_loads():
    """Verify we can fetch the title of the parsed article"""
    doc = Article(load_snippet('document_min.html'))

    assert doc._original_document.title == 'Min Document Title'
Example #29
def test_no_annotations():
    article = Article("<div><p>This is text with no annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(("This is text with no annotations", None), )]
Example #30
def test_empty():
    article = Article("")
    annotated_text = article.main_text

    assert annotated_text == []