Example #1
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example #2
def htmlParser(htmlContent):
    """
    An HTML parser for http://abc.net.au
    """
    doc = Document(htmlContent)
    title = doc.short_title()
    simple_html = doc.summary(True)

    simple_sel = Selector(text=simple_html)

    unclean_body = '\n'.join(simple_sel.xpath('//text()').extract())
    body = _clean_body(unclean_body)

    global_sel = Selector(text=htmlContent)

    time_published = _first_post_time(global_sel)
    author = _author(global_sel)

    keywords = gen_keywords(body)

    return {
        "title": title,
        "body": body,
        "author": author,
        "timePublished": time_published,
        "keywords": keywords
    }
Example #3
    def preliminary_parse(self):
        if (not self.is_downloaded):
            raise Exception("not downloaded")
        try:
            d = Document(self.html)
            self._readability_title = d.short_title()
            self._readability_text = d.summary()
            logging.debug(u"readability title: {0}".format(
                repr(self._readability_title)))
            logging.debug(u"readability text: {0}".format(
                repr(self._readability_text)))
            if (self._readability_title and self._readability_text):
                self.is_parsed = True
                return True
        except Exception as e:
            logging.warning("error while doing readability parse: {0}".format(
                str(e)))
            return False

        logging.debug("falling back to newspaper parse")
        self.newspaper_article.parse()
        logging.debug(u"newspaper title: {0}".format(
            repr(self._newspaper_title)))
        logging.debug(u"newspaper text: {0}".format(repr(
            self._newspaper_text)))
        self.is_parsed = True
        return True
Example #4
	def parse_story(self, response):
		doc = Document(response.text)
		story = NewsItem()
		story['url'] = response.url
		story['headline'] = doc.short_title()
		story['body'] = doc.summary()
		yield story
Example #5
def html2text(url: str) -> str:
    request = urllib.request.Request(
        url,
        headers={
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'
        })
    html = urllib.request.urlopen(request).read()
    doc = Document(html)
    cleaned = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
    soup = BeautifulSoup(cleaned, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text
Example #6
def parse_article(article_url):
    """Parse an online article."""
    source = requests.get(article_url, verify=True, timeout=2)
    urlparsed = urlparse(article_url)
    hostname = "{scheme}://{netloc}".format(scheme=urlparsed.scheme,
                                            netloc=urlparsed.netloc)
    doc = Document(source.text)

    content = bleach.clean(doc.summary(),
                           tags=ALLOWED_TAGS,
                           attributes=ALLOWED_ATTRS,
                           strip=True)
    soup = BeautifulSoup(content, "html.parser")

    for img in soup.findAll("img"):
        if img["src"].startswith("http"):
            continue
        img["src"] = "{root}/{src}".format(root=hostname, src=img["src"])

    for link in soup.findAll("link"):
        if link["href"].startswith("http"):
            continue
        link["href"] = "{root}/{src}".format(root=hostname, src=img["href"])

    return {
        "title": doc.short_title(),
        "content": str(soup),
        "url": article_url
    }
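Aside (not part of the original example): the "{root}/{src}" formatting above can yield double slashes for src values that already begin with "/" and drops the article's directory context for truly relative paths. A minimal sketch of a more robust approach using the standard library's urllib.parse.urljoin (the helper name absolutize and the example URLs are illustrative only):

from urllib.parse import urljoin

def absolutize(src, base):
    # Resolve a possibly-relative src against the page URL.
    # urljoin("https://example.com/a/b.html", "/img/x.png") -> "https://example.com/img/x.png"
    # urljoin("https://example.com/a/b.html", "img/x.png")  -> "https://example.com/a/img/x.png"
    return urljoin(base, src)

In parse_article this would mean joining each src or href against article_url instead of the hand-built hostname string.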
Example #8
 def parse(self, response):
     doc = Document(response.text)
     yield {
         'url': response.url,
         'short_title': doc.short_title(),
         'summary': doc.summary(html_partial=True),
     }
     for next_page in response.css('a::attr("href")'):
         yield response.follow(next_page, self.parse)
Example #9
    def load_alt_engine(self):
        response = requests.get(self.url)
        doc = Document(response.text)
        txt = doc.summary()

        soup = BeautifulSoup(txt, 'html.parser')
        self.text = soup.get_text()
        self.html = response.text
        self.title = doc.short_title()
Example #10
def html_filter(url, text):
    try:
        # readability
        doc = Document(text)
        title = doc.short_title()
        html_sum = doc.summary(html_partial=True)
        # pandoc
        try:
            try:
                doc = pf.convert_text(
                    html_sum,
                    input_format='html-native_divs-native_spans',
                    standalone=True)
            # Strange error: plain pandoc and pypandoc can handle this input without problems.
            except OSError as e:
                print(
                    'Error {} from panflute encountered when processing {}, fallback to pypandoc.'
                    .format(e, url))
                doc = pf.convert_text(pypandoc.convert_text(
                    html_sum, 'json', 'html-native_divs-native_spans'),
                                      input_format='json',
                                      standalone=True)
            except JSONDecodeError as e:
                print(
                    'Error {} from panflute encountered when processing {}, fallback to pypandoc.'
                    .format(e, url))
                doc = pf.convert_text(
                    pypandoc.convert_text(html_sum, 'html',
                                          'html-native_divs-native_spans'),
                    input_format='html-native_divs-native_spans',
                    standalone=True)

            doc = pf.run_filters((increase_header_level, Image_to_Link),
                                 doc=doc)

            temp = pf.convert_text('''# {}\n\n[Source]({})'''.format(
                title, url))

            for item in temp[::-1]:
                doc.content.insert(0, item)

            return pf.convert_text(doc,
                                   input_format='panflute',
                                   output_format='html')
        except:
            print(
                'Cannot handle error from panflute, stop using pandoc filter on {}.'
                .format(url))
            return '<h1>{}</h1><p><a href="{}">Source</a></p>{}'.format(
                title, url, html_sum)
    except Exception as e:
        print('Cannot handle error {}. Skip processing {}.'.format(e, url))
        return ''
Example #11
 def parse_item(self, response):
     filename = hashlib.sha1(response.url.encode()).hexdigest()
     readability_document = Document(response.body, url=response.url)
     item = BeerReviewPage()
     item['url'] = response.url
     item['filename'] = filename
     item['depth'] = response.meta['depth']
     item['link_text'] = response.meta['link_text']
     item['title'] = readability_document.short_title()
     with open('data/' + filename + '.html','wb') as html_file:
         html_file.write(readability_document.content())
     print('(' + filename + ') ' + item['title'] + " : " + item['url'])
     return item
Example #12
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the article's
    title and content.
    """
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # invoke the summary method to invoke readability's magic
    doc.summary(html_partial=True)
    # obtain the article as HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
        bad_attr = True
    else:
        bad_attr = False
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # invoke the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a subset of the given title, use
        # the given title (because we assume this is more accurate, but
        # maybe with some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
Example #13
def HTMLparser(page, blog, url):
    title = None
    content = None
    author = None
    datePublished = None
    dateModified = None

    soup = BeautifulSoup(page, 'lxml')
    doc = Document(page)
    title = doc.short_title()
    content = BeautifulSoup(doc.summary(), 'lxml').get_text()
    try:
        application_json_ld = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).get_text())
    except:
        application_json_ld = None
    if application_json_ld is not None:
        if 'author' in application_json_ld:
            if isinstance(application_json_ld['author'], list):
                author = application_json_ld['author'][0]['name']
            else:
                author = application_json_ld['author']['name']
        if 'datePublished' in application_json_ld:
            datestring = application_json_ld['datePublished']
            datePublished = parse(datestring)
        if 'dateModified' in application_json_ld:
            datestring = application_json_ld['dateModified']
            dateModified = parse(datestring)

    if blog == 'steemit':
        author = soup.find('a', {'class': 'ptc'}).get_text().split(" ")[0]
        datestring = soup.find('span',
                               {'class': 'updated'})['title'].split()[0]
        datePublished = parse(datestring)

    if len(content) < 500:
        return None

    content = content.replace('\n', '')
    return Post(meta={'id': url},
                title=title,
                content=content,
                rawContent=content,
                author=author,
                datePublished=datePublished,
                dateModified=dateModified,
                url=url)
Example #14
def extract_article_info(text):
    """
    Gets simplified page from the text
    Uses readability module
    """
    doc = Document(text)
    # safe fetch title
    title = doc.short_title()
    if not title:
        title = doc.title()
    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())
    # return
    return {'title': title, 'content': content, 'image': image}
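get_page_image is a project helper that is not shown in this snippet. Purely as an illustration of what such a helper might look like (a hypothetical implementation, assuming lxml is available and that the helper returns the first image URL found in the readability content, or None):

import lxml.html

def get_page_image(html):
    # Hypothetical helper: return the first <img> src found in the HTML, or None.
    if not html:
        return None
    tree = lxml.html.fromstring(html)
    sources = tree.xpath('//img/@src')
    return sources[0] if sources else None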
Example #15
def clean_html():
    s = Stallion()
    # a = s.extract("https://www.rtbasia.com/")
    # a = s.extract("http://www.dytt8.net/")
    a = s.extract("http://v.pptv.com/show/fbGeHITqWpj7eeE.html")
    # response = requests.get('http://lady.163.com/19/0111/10/E57V9GIV00267VA9.html')
    # response = requests.get('http://www.rtbchina.com/')
    # response = requests.get('http://guba.eastmoney.com/news,000611,173895506.html')
    doc = Document(a.raw_html)
    # doc = Document(response.text)
    # print(doc.content())
    print(doc.short_title())
    # print(doc.title())
    # print(doc.summary())
    h = html2text.HTML2Text()
    h.ignore_links = True
    print(h.handle(doc.summary()))
Example #16
    def preliminary_parse(self):
        if(not self.is_downloaded):
            raise Exception("not downloaded")
        try:
            d = Document(self.html)
            self._readability_title = d.short_title()
            self._readability_text = d.summary()
            logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
            logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
            if(self._readability_title and self._readability_text):
                return
        except Exception as e:
            logging.warning("error while doing readability parse: {0}".format(str(e)))

        logging.debug("falling back to newspaper parse")
        self.newspaper_article.parse()
        logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
        logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
Example #17
def readability():
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        data['params']['url'] = request.args.get('url')
        if not data['params']['url']:
            data['error'] = '[url] parameter not found'
            return jsonify(data)

        response = requests.get(data['params']['url'])
        doc = Document(response.text)

    elif request.method == 'POST':
        params = request.form  # postdata

        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)

        if not params['html']:
            data['error'] = 'html parameter not found'
            return jsonify(data)

        doc = Document(params['html'])

    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    #data['readability']['content'] = doc.content()
    data['readability']['article_html'] = doc.summary(html_partial=True)

    soup = BeautifulSoup(data['readability']['article_html'], "html.parser")
    data['readability']['text'] = soup.get_text()

    return jsonify(data)
Example #18
 def complement(self):
     for entry in self.entries:
         try:
             response = requests.get(entry.url, timeout=10)
         except requests.RequestException as excp:
             logger.warning('Exception requesting article %s: %s',
                            entry.url, excp)
             continue
         document = Document(response.content, url=response.url)
         # Image extraction first
         document._html()  # Trigger parsing
         images = document.html.xpath(
             '//meta[@property="og:image"]/@content')
         images += document.html.xpath(
             '//meta[@name="twitter:image:src"]/@content')
         # Content extraction second
         entry.url = response.url
         entry.image = (images or [''])[0]
         entry.title = document.short_title()
         entry.content = document.summary()
         yield entry
Example #19
    def resolve_article(self, args, context, info):
        query = Article.get_query(context)
        id = args.get("article_id")
        title = args.get("article_content")
        article = query.filter(
            or_(ArticleModel.object_id == id,
                (ArticleModel.title.like("%{}%".format(title))))).first()

        response = requests.get(article.url)
        doc = Document(response.text)
        texts = pq(response.text)('body').text()
        article.updated_date = int(
            calendar.timegm(datetime.datetime.utcnow().utctimetuple()))
        article.article_view_content = str(
            render_template('body_template.html', article_content=doc.summary(True),
                            title=str(doc.short_title()),
                            article=str(doc.title()),
                            read_time=str(ReadingTime().estimate(texts, True)),
                            base_url=article.main_url, article_url=article.url)) \
            .replace("\"", "'").replace("\n", "").replace("\t", "").replace("$", "&#36;")
        return article
Example #20
    def extract(self, item):
        """Creates an readability document and returns an ArticleCandidate containing article title and text.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """

        doc = Document(deepcopy(item['spider_response'].body))
        description = doc.summary()

        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name
        article_candidate.title = doc.short_title()
        article_candidate.description = description
        article_candidate.text = self._text(item)
        article_candidate.topimage = self._topimage(item)
        article_candidate.author = self._author(item)
        article_candidate.publish_date = self._publish_date(item)
        article_candidate.language = self._language(item)

        return article_candidate
Example #21
def parse_web_page(text):
    """
    Generic web page parser with readability.
    Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')
    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(str(e))
    else:
        return doc.short_title(), doc.summary(True)
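A hedged usage sketch for the fallback above (ParserException and the decoded html string are assumed to come from the surrounding project, as in the original):

try:
    title, article_html = parse_web_page(html)
except ParserException as exc:
    print('readability fallback failed: {}'.format(exc))
    title, article_html = None, None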
Example #22
def handle_data():
    def cleancap(raw_cap):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_cap)
        tmp = cleantext.split('\n')
        cap = list()
        pre = ''
        for line in tmp:
            if line.replace(' ', '') and line != pre:
                if '-->' in line: cap.append('')
                else: pre = line
                cap.append(line)
        tmp = set()
        for idx in range(len(cap)):
            if '-->' in cap[idx] and (idx >= len(cap) - 2
                                      or '-->' in cap[idx + 2]):
                tmp.add(idx)
                tmp.add(idx + 1)
        final = list()
        for idx in range(len(cap)):
            if idx not in tmp: final.append(cap[idx])
        return '\n'.join(final)

    user_level = request.form['user_level']
    title = ''
    publish_date = ''
    text = request.form['text']
    if (text.startswith('http://www.youtube.com')
            or text.startswith('http://youtube.com')
            or text.startswith('http://youtu.be')
            or text.startswith('https://www.youtube.com')
            or text.startswith('https://youtube.com')
            or text.startswith('https://youtu.be')):
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'skip_download': True,  # We just want to extract the info
            'outtmpl': 'download/target'  # file_path/target
        }
        file = ''
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([text])
            dirPath = "download"
            fileList = os.listdir(dirPath)
            if 'target.en.vtt' in fileList:
                file = cleancap(open('download/target.en.vtt').read())
            else:
                file = 'There is no English subtitle in this video!'
            for fileName in fileList:
                if os.path.isfile(os.path.join(dirPath, fileName)):
                    os.remove(os.path.join(dirPath, fileName))
        v_id = text.split('=')[-1]
        content = [v_id, file]
        type_ = 'youtube'
        r = requests.get(text)
        if r.status_code < 400:
            title = BeautifulSoup(r.text, 'html.parser').find('title').text
            publish_date = BeautifulSoup(r.text, 'html.parser').find(
                'meta', itemprop="datePublished")['content']
    elif text.startswith('http://') or text.startswith('https://'):
        response = requests.get(text, headers=headers)
        doc = Document(remove_sometag(response.text))
        title = doc.short_title()
        publish_date = getPublishDate(response.content.decode('UTF-8'))
        content = doc.summary()
        type_ = 'url'
    else:
        content = text
        type_ = 'text'

    content = clean_content(content, type_)
    new,pure_text,vocab_dict = create_article(title, user_level, content, type_=='youtube', \
                         set(dictWord['V'].keys()), set(dictWord['N'].keys()), set(dictWord['ADJ'].keys()))
    store(pure_text, vocab_dict, user_level)
    return render_template('format.html', title=title, publish_date=publish_date, \
                           user_level=user_level, content=new)
Example #23
    def __init__(self, url, full_content=None, timeout=10):
        logger.info("HtmlContentExtractor.__init__: url=%s, full_content is None=%s", url, (full_content == None))

        # validate
        if not isinstance(url, str):
            raise RuntimeError("url not str.")
        if len(url) == 0:
            raise RuntimeError("len(url) == 0")

        if full_content is not None:
            if not isinstance(full_content, str):
                raise RuntimeError("full_content not str.")
            if len(full_content) == 0:
                raise ContentNoDataException(url)

        # Initialize instance variable
        self.url = url
        self.title = ""
        self.full_content = full_content
        self.content = ""
        self.simplified_content = ""
        self.summary_list = ""

        # Get html document
        if self.full_content is None:
            logger.debug("requests.get: start. url=%s", url)
            try:
                r = requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException as ex:
                logger.warn("requests.get: fail. exception=%s", repr(ex))
                raise ContentRequestFailException(url)
            logger.debug("requests.get: end. status_code=%s, content_type=%s, len(full_content)=%s", r.status_code, r.headers["content-type"], len(r.text))

            logger.debug("request result check: start.")
            if r.status_code == 404:
                raise ContentNotFoundException(url)
            if len(r.text) == 0:
                raise ContentNoDataException(url)
            logger.debug("request result check: end.")

            logger.debug("get full_content: start.")
            self.full_content = r.text
            logger.debug("get full_content: end. len(full_content)=%s", len(self.full_content))
        else:
            logger.debug("full_content not None")

        # Analyze html document

        ## Get extracted content
        logger.debug("extract content: start.")
        doc = Document(self.full_content)
        self.content = doc.summary()
        logger.debug("extract content: end. len(content)=%s", len(self.content))

        ## Get title
        logger.debug("get title: start.")
        self.title = doc.short_title()
        logger.debug("get title: end. title=%s", self.title)

        ## Get simplified content
        logger.debug("content simplify: start.")
        markdown_content = pypandoc.convert_text(self.content, "markdown_github", format="html", extra_args=["--normalize", "--no-wrap"])
        self.simplified_content = pypandoc.convert_text(markdown_content, "html", format="markdown_github", extra_args=["--email-obfuscation=none"])
        logger.debug("content simplify: end. len(simplified_content)=%s", len(self.simplified_content))

        # Get summary
        logger.debug("summarize: start.")
        auto_abstractor = AutoAbstractor()
        abstractable_doc = AbstractableTopNRank()
        abstractable_doc.set_top_n(3)
        summary_list = auto_abstractor.summarize(self.simplified_content, abstractable_doc)["summarize_result"]
        self.summary_list = [pypandoc.convert_text(summary.strip(), "plain", format="html").strip() for summary in summary_list]
        logger.debug("summarize: end. len(summary_list)=%s", len(self.summary_list))
Example #24
class TitleExtractor(object):
    def __init__(self, html):
        self._html = html
        self._title = ''
        self._doc = Document(html)

    def clean_title(self, title):
        spliters = [' - ', '–', '—', '-', '|', '::']
        for s in spliters:
            if s not in title:
                continue
            tts = title.split(s)
            if len(tts) < 2:
                continue
            title = tts[0]
            break
        return title

    def get_title_method1(self):
        self._title = self._doc.short_title()

    def get_title_method2(self):
        # Handle irregular titles on special websites
        if not self._title:
            regex = TITLE_RE
            self._title = get_info(self._html, regex, fetch_one=True)

    def get_title_method3(self):
        g = Goose()
        article = g.extract(raw_html=self._html)
        self._title = article.title

    def get_title_method4(self):
        doc = lxml.html.fromstring(self._html)
        title = ''
        title_el = doc.xpath('//title')
        if title_el:
            title = title_el[0].text_content().strip()
        if len(title) < 7:
            tt = doc.xpath('//meta[@name="title"]')
            if tt:
                title = tt[0].get('content', '')
        if len(title) < 7:
            tt = doc.xpath(
                '//*[contains(@id, "title") or contains(@class, "title")]')
            if not tt:
                tt = doc.xpath(
                    '//*[contains(@id, "font01") or contains(@class, "font01")]'
                )
            for t in tt:
                ti = t.text_content().strip()
                if ti in title and len(ti) * 2 > len(title):
                    title = ti
                    break
                if len(ti) > 20: continue
                if len(ti) > len(title) or len(ti) > 7:
                    title = ti
        self._title = title

    def get_title(self):
        self.get_title_method1()
        if not self._title:
            self.get_title_method2()
        if not self._title:
            self.get_title_method3()
        self._title = self.clean_title(self._title)
        return self._title
Example #25
import urllib.request
from readability import Document

req = urllib.request.Request(
    'https://en.wikipedia.org/wiki/%22Hello,_World!%22_program',
    headers={
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15'
    })

with urllib.request.urlopen(req) as f:
    urllib_content = f.read()
# print(urllib_content.decode("utf-8"))

doc = Document(urllib_content)

print(doc.title())
print(doc.short_title())

print(doc.summary())
Example #26
import requests
from readability import Document
from pprint import pprint

response = requests.get('https://laravel-news.com/announcing-building-a-chatbot-with-laravel-and-botman')

doc = Document(response.text)
# API methods:
# .title() -- full title
# .short_title() -- cleaned up title
# .content() -- full content
# .summary() -- cleaned up content
data = dict()
data['title'] = doc.title()
data['short_title'] = doc.short_title()
data['content'] = doc.content()
data['summary'] = doc.summary()


pprint(data)