Example #1
def get_text(url):
    response = get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    body = bs4.BeautifulSoup(summary, features="lxml").get_text()
    return f"{title} : {body}"
Example #2
def convert(link):
    """
    use buriy's readability implementation to transcode a web page
    and return its title, the transcoded content and the images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example #3
    def process_item(self, item: NoticeItem, spider: scrapy.Spider):
        if item and isinstance(item, NoticeItem):
            # Extract the main content from the HTML.
            def clean_p(s: str) -> str:
                return s.replace('\xa0', ' ').strip()

            def clean_all(s: str) -> str:
                s = self.SPACES_PATTERN.sub('\n', s)
                s = s.strip()
                return s

            """ Expel non-UTF8 characters. """
            content = item['content'].decode('utf-8', 'replace')

            item['author'] = _find_author(content)
            item['department'] = _find_department(content)
            item['publish_time'] = _find_publish_time(content)

            article = Document(content, handle_failures=None)
            page = etree.HTML(article.summary())
            paragraphs = [
                clean_p(p.xpath('string(.)')) for p in page.xpath('//p')
            ]
            item['content'] = clean_all('\n'.join(paragraphs))
            self.pg_pool.runInteraction(self.submit_item, item)
            # Scrapy pipelines should return the item so later pipelines receive it.
            return item
        else:
            return item
Example #4
 def POST(self):
     url = web.data().decode('utf8')
     response = requests.get(url)
     doc = Document(response.text)
     summary = doc.summary()
     web.header('Content-Type', 'text/plain')
     return clean(capture(summary))
Example #5
def parse(html):
    doc = Document(html)
    title = doc.title()
    if title == u'[no-title]':
        title = u''
    content_html = doc.summary()
    content_html = content_html.replace(u'<html>', u'').replace(u'</html>', u'')\
        .replace(u'<body>', u'').replace(u'</body>', u'')

    clear_paths = [u'//script', u'//img', u'//a']
    body = clearDOM(html, clear_paths)

    match_list = findTimeStr(body)
    post_date = u''
    for match_item in match_list:
        if len(match_item) > len(post_date):
            post_date = match_item

    style_in_list = []
    style_need_replace = []

    content_item = {
        u'title': title,
        u'content_html': content_html,
        u'post_date': post_date,
        u'style_in_list': style_in_list,
        u'style_need_replace': style_need_replace
    }
    return content_item
Example #6
def parse_article(url: str) -> Dict:
    """
    Parses the HTML output for URL and grabs title and description
    :param url: The URL sent by the user
    :return: Dictionary containing HTML data
    """
    if is_url_blacklisted(url):
        return {}

    data = {"title": "", "description": "", "image": "", "body": ""}
    try:
        headers = {'user-agent': 'Bookie/app'}
        response = requests.get(https_upgrade(url), timeout=3, headers=headers)
    except Timeout:
        return data

    tree = html.fromstring(response.content)
    doc = Document(response.text)

    title = tree.xpath('//title/text()')
    description = tree.xpath('//meta[@name="description"]/@content')
    image = tree.xpath('//meta[@property="og:image"]/@content')
    body = doc.summary()

    if title:
        data["title"] = title[0]
    if description:
        data["description"] = description[0]
    if image:
        data["image"] = https_upgrade(image[0])
    if body:
        data["body"] = body

    return data
Example #7
    def test_correct_cleanup(self):
        sample = """
        <html>
            <body>
                <section>test section</section>
                <article class="">
<p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
                <aside id="comment1"/>
                <div id="comment2">
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                </div>
                <div id="comment3"/>
                <aside id="comment4">A small comment.</aside>
                <div id="comment5"><p>The comment is also helpful, but it's
                    still not the correct item to be extracted.</p>
                    <p>It's even longer than the article itself!"</p></div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        # print(s)
        assert "punctuation" in s
        assert not "comment" in s
        assert not "aside" in s
Example #8
 def parse(self, url: str) -> dict:
     """Download the article and parse it"""
     r = get(url, headers=HTML_HEADERS)
     doc = Document(r.text, url=url)
     html = doc.summary(html_partial=True)
     clean_html = self.fix_blockquotes(html)
     return {"content": clean_html}
Example #9
def analyze(request):
    'API text analyze view'
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # catch POST form as well
            for key in request.POST.dict().keys():
                text = key

        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            page = requests.get(text)
            doc = Document(page.text)
            soup = BeautifulSoup(doc.summary())
            text = soup.get_text()
            title = doc.title().strip()
            text = '{0}.\n{1}'.format(title, text)

        if not text:
            response = JsonResponse(
                {'status': 'false', 'message': 'need some text here!'})
            response.status_code = 400
            return response

        # add some limit here
        text = text[:200000]
        ret = {}
        ret = analyze_text(text)
        return JsonResponse(ret)
    else:
        ret = {'methods_allowed': 'POST'}
        return JsonResponse(ret)
Example #10
	def parse_story(self, response):
		doc = Document(response.text)
		story = NewsItem()
		story['url'] = response.url
		story['headline'] = doc.short_title()
		story['body'] = doc.summary()
		yield story
Example #11
    def getAutoDetail(cls,
                      contentPageNumber,
                      html,
                      enableDownloadImage=False,
                      enableSnapshot=False):
        autoDetail = {}
        try:

            doc = Document(html)
            # response.
            if contentPageNumber <= 1:
                autoDetail["title"] = ArticleUtils.cleanHeadTitle(doc.title())
                # autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
                # autoDetail["html"] = html
            contentSnapshot = doc.summary()
            if StringUtils.isNotEmpty(
                    ArticleUtils.removeAllTag(contentSnapshot)):
                if enableSnapshot:
                    autoDetail["contentSnapshot"] = contentSnapshot.replace(
                        "<html>",
                        "").replace("</html>",
                                    "").replace("<body>",
                                                "").replace("</body>", "")
                autoDetail["content"] = ArticleUtils.removeTag4Content(
                    contentSnapshot)
                if enableDownloadImage:
                    autoDetail[
                        "contentImages"] = ArticleUtils.get_content_image_urls(
                            contentSnapshot, response.url)
        except Exception as e:
            return autoDetail
        return autoDetail
Example #12
def main(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    document = Document(article.html)
    summary = document.summary()
    content = document.content()

    title = get_title(article)
    text = get_text(article)
    entities = get_entities(text)
    phrases = get_phrases(text, entities)
    keywords = get_keywords(article, phrases)
    urls_primary = get_urls(summary, url, [])
    urls_secondary = get_urls(content, url, urls_primary)

    return {
        'title': title,
        'text': text,
        'entities': entities,
        'keywords': keywords,
        'phrases': phrases,
        'urls': {
            'primary': urls_primary,
            'secondary': urls_secondary,
        },
    }
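Example #12 mixes two extraction libraries. A sketch of the imports it appears to assume (newspaper3k for Article, readability-lxml for Document); the get_* helpers are project-specific and are not reproduced here:

# Assumed imports for Example #12
from newspaper import Article      # newspaper3k: download(), parse(), nlp()
from readability import Document   # readability-lxml: summary(), content()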
Example #13
 def getAutoDetail(cls,
                   response,
                   enableDownloadImage=False,
                   enableSnapshot=False,
                   isFirstPage=True):
     autoDetail = {}
     try:
         html = "".join(response.xpath("//html").extract())
         doc = Document(html)
         if isFirstPage:
             autoDetail["title"] = doc.title()
             autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
         contentSnapshot = doc.summary()
         if enableSnapshot:
             autoDetail["contentSnapshot"] = contentSnapshot.replace(
                 "<html>",
                 "").replace("</html>",
                             "").replace("<body>",
                                         "").replace("</body>", "")
         autoDetail["content"] = ArticleUtils.removeTag4Content(
             contentSnapshot)
         if enableDownloadImage:
             autoDetail[
                 "contentImages"] = ArticleUtils.get_content_image_urls(
                     contentSnapshot, response.url)
     except Exception as e:
         return autoDetail
     return autoDetail
Example #14
    def parse(self, response):
        # Mind the difference between response.body (str) and response.text (unicode),
        # and watch out for the page encoding.
        key = response.url
        code = get_code(response.body)
        text = response.body.decode(code).encode('utf-8')
        name, college = pro_lists[key][0], pro_lists[key][1]

        # Save the HTML file in its original form
        homepage_name = unicode(name + '_' + college, 'utf-8')
        filename = '%s.html' % homepage_name
        with open(os.path.join(source_html, filename), 'w') as f:
            f.write(text)
        self.log('Saved origin file %s' % filename)

        # Use the readability package for a first pass to get a reasonably clean page
        doc = Document(text)
        clean_text, title = self.get_cleanpage(doc)
        with open(os.path.join(clean_html, filename), 'w') as f:
            f.write(clean_text)
        self.log('Saved clean file %s' % filename)

        # Use htmlpaser to extract all the text (together with readability for the basic extraction)
        data = htmlpaser.FilterTag.strip_tags(text)
        data1 = beatifulsoup.get_html_content(text)
        html_content.write(homepage_name + ': "' + data + '"\n')
Example #15
    def process_item(self, article, spider):

        doc = Document(article['text'])
        article['text'] = strip_tags(doc.summary())
        article['hash'] = hashlib.sha256(article['url'].encode('utf-8')).hexdigest()

        return article
Example #16
def get_article(url):
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(
            url, headers=headers, timeout=4
        )  # may raise an exception (e.g. Timeout or TooManyRedirects) if the server does not respond
    except BaseException:
        return ''

    readability_doc = Document(response.text)

    # doc.summary() is not really a summary, just the main part of the
    # website's content
    html_content = readability_doc.summary()
    title = readability_doc.title()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collapse runs of excessive whitespace and newlines
    content = re.sub(r"(\s{2,})|(\n{2,})",
                     "\n",
                     soup.get_text(),
                     flags=re.UNICODE)
    if " - " in title:
        title = re.sub(r" - [\s\w]+$", "", title)

    return content, title
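As the comment in Example #16 notes, Document.summary() returns the cleaned main-content HTML of the page rather than an abstract. A minimal sketch, assuming readability-lxml and the '[no-title]' fallback seen in Example #5:

from readability import Document

html = "<html><body><p>Some reasonably long article text goes here.</p></body></html>"
doc = Document(html)
print(doc.title())    # the page title; '[no-title]' when none can be found
print(doc.summary())  # cleaned main-content HTML, not a prose summary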
Example #17
 def handleMatch(self, match, full_text):
     """Find a import statement in the text, if it's there we extract the
     import type and its associated URL."""
     import_type, url = match.group(1, 2)
     #TODO: Refactor this to use methods instead of long
     # if conditional
     if import_type == "html":
         source = request.urlopen(url)
         parser = HTMLBodyParser()
         #TODO: Handle non-UTF-8
         parser.feed(source.read().decode("UTF-8"))
         source.close()
         return parser.document, match.start(), match.end()
     if import_type == "html_s":
         source = request.urlopen(url)
         #TODO: Handle non-UTF-8
         source_document = Document(source.read().decode("UTF-8"))
         parser = HTMLBodyParser()
         parser.feed(source_document.summary())
         source.close()
         return parser.document, match.start(), match.end()
     if import_type == "csv":
         #TODO: Handle non-UTF-8
         source = request.urlopen(url)
         csv_text = source.read().decode("UTF-8")
         table = etree.Element("table")
         for row in csv.reader(csv_text.splitlines()):
             table_row = etree.SubElement(table, "tr")
             for data in row:
                 td = etree.SubElement(table_row, "td")
                 td.text = data
         source.close()
         return table, match.start(), match.end()
Example #18
def text(url):
    if 'http' not in url:
        url = 'http://' + url
    page = get(url).text
    doc = Document(page).summary()
    text = BeautifulSoup(doc).get_text()
    return text.strip()
Example #19
 def test_wrong_link_issue_49(self):
     """We shouldn't break on bad HTML."""
     sample = load_sample(
         "the-hurricane-rubin-carter-denzel-washington.html")
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="content__article-body ', res[0:39])
Example #20
def getHtmlToClipboard(dest_url):
    html_headers = {
        "Host": "note.youdao.com",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        #"Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive"
    }
    try:
        html_host = urllib.parse.urlparse(dest_url)
        html_headers["Host"] = html_host.netloc
        # urllib does not decompress responses automatically
        #html_request = urllib.request.Request(url, headers=html_headers)
        #html_respone = urllib.request.urlopen(html_request)
        #html_respone = urllib.request.urlopen(dest_url)
        #print(html_respone.read().decode("utf-8"))
        #html_text = html_respone.read().decode("utf-8")

        html_respone = requests.get(dest_url, headers=html_headers)
        #print(html_respone.text)
        html_text = html_respone.text

        html_doc = Document(html_text)
        HtmlClipboard.PutHtml(html_doc.summary())
        if HtmlClipboard.HasHtml():
            print('there is HTML!!')
            dirty_HTML = HtmlClipboard.GetHtml()

    except Exception:
        return False
    else:
        return True
Example #21
def extract():
	
	url = request.form['site']

	response = requests.get(url)
	doc = Document(response.text)

	parser = html2text.HTML2Text()
	parser.ignore_links = True
	parser.ignore_images = True
	parser.ignore_emphasis = True
	parser.ignore_anchors = True
	parser.ignore_tables = True

	title = doc.title()
	title = re.sub(r' *- [-a-zA-Z0-9 @:%._\+~#=]{1,256}', '', title)														#	<---- this crops everything after -[space]
	title = re.sub(r' *- *[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', title)	#	<---- this is for websites only
	article = parser.handle(str(doc.summary()))
	article = fix_article(article)
	article = article.split("<br/>")

	if url != '':
		insert_in_db(analytics_client, container, url, title, article)
		return flask.render_template('index.html', title=title, data=article)
	else:
		error_msg = "Enter a valid URL"
		return flask.render_template('index.html', error=error_msg)
Example #22
def run_readability(htmlstring):
    '''try with the Python3 port of readability.js'''
    try:
        doc = Document(htmlstring)
        return doc.summary()  # sanitize(doc.summary())
    except Exception as err:
        print('Exception:', err)
        return ''
Example #23
 def test_best_elem_is_root_and_passing(self):
     sample = ('<html class="article" id="body">'
               '   <body>'
               '       <p>1234567890123456789012345</p>'
               '   </body>'
               '</html>')
     doc = Document(sample)
     doc.summary()
Example #24
 def process_one(self, content):
     try:
         doc = Document(content)
         return ContentResult(doc.title(), doc.summary())
     except Exception as e:
         logger.error(
             f"Readability failed on {content.title} with error {e}")
         return ContentResult('', '')
Example #25
 def test_si_sample(self):
     """Using the si sample, load article with only opening body element"""
     sample = load_sample('si-game.sample.html')
     doc = Document(
         sample,
         url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
     res = doc.summary()
     self.assertEqual('<html><body><div><div class', res[0:27])
Example #26
 def get_entry_content(self, entry, rss_entry):
     try:
         r = requests.get(rss_entry.link)
         assert r.status_code == 200
     except Exception:
         return '<div class="alert alert-warning" role="alert">Unable to get the full article with readability, because the page didn\'t load :(</div>\n' + super(
         ).get_entry_content(entry, rss_entry)
     return Document(r.text).summary()
Example #27
def get_main_text(url):
    response = requests.get(url)
    doc = Document(response.text)
    text = doc.summary()
    re_form = r'(?<=<p>)(.*?)(?=</p>)'
    re_text = " ".join(re.findall(re_form, text))
    print(re_text)
    return re_text
Example #28
    def test_many_repeated_spaces(self):
        long_space = " " * 1000000
        sample = "<html><body><p>foo" + long_space + "</p></body></html>"

        doc = Document(sample)
        s = doc.summary()

        assert "foo" in s
Example #29
def get_text_from_url(url):
    '''
    Get most significant text returned from URL.
    '''
    response = requests.get(url)
    doc = Document(response.text)
    text = dataset.preprocess_html(doc.summary())
    return text