def test_best_elem_is_root_and_passing(self):
     sample = (
         '<html class="article" id="body">'
         '   <body>'
         '       <p>1234567890123456789012345</p>'
         '   </body>'
         '</html>'
     )
     doc = Document(sample)
     doc.summary()
 def test_si_sample(self):
     """Using the si sample, load article with only opening body element"""
     sample = load_sample('si-game.sample.html')
     doc = Document(sample)
     doc.parse(["summary"])
     res = doc.summary()
     self.assertEqual('<html><body><h1>Tigers-Roya', res[0:27])
Example #3
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
Example #4
 def process_item(self, article, spider):
     
     doc = Document(article['text'])
     article['text'] = strip_tags(doc.summary())
     article['hash'] = hashlib.sha256(article['url'].encode('utf-8')).hexdigest()  # sha256 needs bytes, not str
     
     return article
Example #5
 def get(self):
     url = self.get_argument("url", None)
     # https://www.ifanr.com/1080409
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if doc:
         self.res = dict(doc)
         return self.write_json()
     try:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         doc = Document(response.text)
         title = doc.title()
         summary = doc.summary()
         markdown = html2text.html2text(summary)
         markdown = markdown.replace('-\n', '-')
         markdown = markdown.strip()
         res = {}
         res['url'] = url
         res['title'] = title
         res['markdown'] = markdown
         if title and markdown:
             webcache = Webcache
             webcache.new(res)
             self.res = res
         self.write_json()
     except Exception as e:
         print(e)
 def test_lxml_obj_result(self):
     """Feed Document with an lxml obj instead of an html string. Expect an lxml response"""
     utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
     sample = lxml.html.document_fromstring(load_sample('nyt-article-video.sample.html'), parser=utf8_parser)
     doc = Document(sample, url='http://nytimes.com/')
     res = doc.summary()
     self.assertFalse(isinstance(res, basestring))
    def test_correct_cleanup(self):
        sample = """
        <html>
            <body>
                <section>test section</section>
                <article class="">
<p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
                <aside id="comment1"/>
                <div id="comment2">
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                </div>
                <div id="comment3"/>
                <aside id="comment4">A small comment.</aside>
                <div id="comment5"><p>The comment is also helpful, but it's
                    still not the correct item to be extracted.</p>
                    <p>It's even longer than the article itself!"</p></div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        #print(s)
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample('si-game.sample.html')
     doc = Document(sample)
     doc.parse(["summary"], html_partial=True)
     res = doc.summary()
     self.assertEqual('<div><h1>Tigers-R', res[0:17])
    def test_many_repeated_spaces(self):
        long_space = ' ' * 1000000
        sample = '<html><body><p>foo' + long_space + '</p></body></html>'

        doc = Document(sample)
        s = doc.summary()

        assert 'foo' in s
 def test_si_sample(self):
     """Using the si sample, load article with only opening body element"""
     sample = load_sample('si-game.sample.html')
     doc = Document(
         sample,
         url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
     res = doc.summary()
     self.assertEqual('<html><body><div><div class', res[0:27])
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample("si-game.sample.html")
     doc = Document(
         sample, url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html"
     )
     res = doc.summary(enclose_with_html_tag=True)
     self.assertEqual('<div><div class="', res[0:17])
Example #12
 def get(self):
   urls = self.get_query_arguments('url')
   if urls and len(urls) == 1:
     url = urls[0]
     doc = Document(requests.get(url).text)
     self.write(smartypants(doc.summary()))
     self.write(STYLE)
   else:
     self.write("Please provide ?url=[your-url]")
Example #13
    def transform(self, row, chan):
        row['response'] = resolve_future(row['response'])

        doc = Document(row['response'].content)

        row['title'] = doc.title()
        summary = doc.summary()
        row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()

        yield row
Example #14
def extract_article(url, ip):
    """Extracts the article using readability"""
    title, summary = None, None
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        summary = unicode(doc.summary())
        title = unicode(doc.title())
        return title, summary
    else:
        return None
Example #15
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the articles
    title and content.
    """
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # invoke the summary method to invoke readability's magic
    doc.summary(html_partial=True)
    # obtain the article as HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
        bad_attr = True
    else:
        bad_attr = False
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # invoke the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a substring of the given title, use
        # the given title (because we assume it is more accurate, though
        # maybe with some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
Example #16
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive+'/*.html'):
        fname = os.path.basename(html)+'.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
            with open(savepath, 'w') as saving:
                json.dump(data, saving)
Example #17
    def preliminary_parse(self):
        if(not self.is_downloaded):
            raise Exception("not downloaded")
        try:
            d = Document(self.html)
            self._readability_title = d.short_title()
            self._readability_text = d.summary()
            logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
            logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
            if(self._readability_title and self._readability_text):
                return
        except Exception as e:
            logging.warning("error while doing readability parse: {0}".format(str(e)))

        logging.debug("falling back to newspaper parse")
        self.newspaper_article.parse()
        logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
        logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
Example #18
 def get(self):
     sharetype = self.get_argument("sharetype", "goodlink")
     link = self.get_argument("link", '')
     user_id = self.current_user["user_id"]
     assert link
     url = link
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if not doc:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         logger.info('response.encoding {}'.format(response.encoding))
         doc = Document(response.text)
         doc_title = doc.title()
         summary = doc.summary()
         _markdown = html2text.html2text(summary)
         _markdown = _markdown.replace('-\n', '-').strip()
         res_webcache = {}
         res_webcache['url'] = url
         res_webcache['title'] = doc_title
         res_webcache['markdown'] = _markdown
         if _markdown:
             webcache = Webcache
             webcache.new(res_webcache)
     else:
         logger.info('already')
         doc_title = doc.title
     res = {
         'title': doc_title,
         'sharetype': sharetype,
         'link': link,
     }
     share = Share
     res['user_id'] = user_id
     share = share.new(res)
     user = User.by_sid(user_id)
     user.user_leaf += 10
     user.save()
     self.redirect("/share/" + str(share.id))
Example #19
 def complement(self):
     for entry in self.entries:
         try:
             response = requests.get(entry.url, timeout=10)
         except requests.RequestException as excp:
             logger.warning('Exception requesting article %s: %s',
                            entry.url, excp)
             continue
         document = Document(response.content, url=response.url)
         # Image extraction first
         document._html()  # Trigger parsing
         images = document.html.xpath(
             '//meta[@property="og:image"]/@content')
         images += document.html.xpath(
             '//meta[@name="twitter:image:src"]/@content')
         # Content extraction second
         entry.url = response.url
         entry.image = (images or [''])[0]
         entry.title = document.short_title()
         entry.content = document.summary()
         yield entry
    def extract(self, item):
        """Creates an readability document and returns an ArticleCandidate containing article title and text.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """

        doc = Document(deepcopy(item['spider_response'].body))
        description = doc.summary()

        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name
        article_candidate.title = doc.short_title()
        article_candidate.description = description
        article_candidate.text = self._text(item)
        article_candidate.topimage = self._topimage(item)
        article_candidate.author = self._author(item)
        article_candidate.publish_date = self._publish_date(item)
        article_candidate.language = self._language(item)

        return article_candidate
Example #21
def parse_web_page(text):
    """
    Generic web page parser with readability.
    Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')
    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(e.message)
    else:
        return doc.short_title(), doc.summary(True)

def remove_tags(text):  # helper used by the snippet below
    return TAG_RE.sub('', text)

superbowl_media = get_media_data('tweets_#superbowl')
#superbowl_media.to_pickle('superbowl_media')
#superbowl_media = pd.read_pickle('superbowl_media')

target_day = superbowl_media[superbowl_media['day']==1]
target_time = target_day[target_day['hour']==19]
target = target_time[target_time['name']=='YahooSports']
url = target.iloc[0]['media'][0]['expanded_url']

import requests
from readability import Document
response = requests.get(url)
doc = Document(response.text)
print(remove_tags(doc.summary()))
t = open('article.txt','w')
t.write(remove_tags(doc.summary()))
t.close()

from sumy.parsers.plaintext import PlaintextParser 
from sumy.nlp.tokenizers import Tokenizer 
from sumy.summarizers.lex_rank import LexRankSummarizer

file = "article.txt"
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer = LexRankSummarizer()
summary = summarizer(parser.document, 5) 
print(doc.title())
for sentence in summary:
    print(sentence)
def predict():
    
    
    df = pd.read_csv('../webapp/revised_rating_data')
    lemmatized = df['lemmatized'].tolist()
    X_class = df['lemmatized']
    y_class = df['point_non-bad']
    X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.25, random_state=42)
    tvec_class = TfidfVectorizer(stop_words='english')
    tvec_class.fit(X_train_class.values.astype('U'))
    X_train_class = tvec_class.transform(X_train_class.values.astype('U'))
    lr_class = LogisticRegression()
    lr_class.fit(X_train_class, y_train_class)
    
    data = pd.read_csv('../webapp/revised_data')   
    X = df['lemmatized']
    y = data['topics']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
    tvec = TfidfVectorizer(stop_words='english')
    tvec.fit(X_train.values.astype('U'))
    X_train = tvec.transform(X_train.values.astype('U'))
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
      
    if request.method == 'POST':
        message = request.form['message']
        data = message
        response = requests.get(data)
        doc = Document(response.text)
        full_text = doc.summary(html_partial=True)
        full_text = full_text.replace(r"\n", " ")
        full_text = full_text.replace(r"\t", " ")
        full_text = full_text.replace(r"/", " ")
        full_text = full_text.replace(r"<p>", " ")
        full_text = normalize('NFKD', full_text)
        full_text = full_text.split('< p>')
        TAG_RE = re.compile(r'<[^>][^>]+>')
        
        def remove_tags(text):
            return TAG_RE.sub(' ', text)
        
        term_text = list(map(remove_tags, full_text))
        term_frame = pd.DataFrame(np.array(term_text), columns = ['quoteText'])
        
        def text_to_words(titletext):
            letters_only = re.sub("[^a-zA-Z]", " ", titletext)
            words = letters_only.lower().split()
            lemmatizer = WordNetLemmatizer()
            tokens_lem = [lemmatizer.lemmatize(i) for i in words]
            return(' '.join(tokens_lem))
        
        lemm_text=[]
        for text in term_frame['quoteText']:
            lemm_text.append(text_to_words(text))
            
           
        vect_class = tvec_class.transform(lemm_text).toarray()
        prediction_class = pd.DataFrame(lr_class.predict_proba(vect_class), columns=['warning','non-warning'])
        
        vect = tvec.transform(lemm_text).toarray()
        prediction = pd.DataFrame(lr.predict(vect), columns =['pred_topic'])
        
        results = pd.merge(term_frame, prediction, left_index=True, right_index=True)
        results = pd.merge(results, prediction_class, left_index=True, right_index=True)
        results = results.sort_values('non-warning')
        my_prediction = results["warning"].mean()
        #results = results[results['warning'] > 0.3 ]
        topics = []
        topicIndx = []
        topicContent=[]
        for i in results['pred_topic']:
            if i not in topics:
                topics.append(i)
        for i in topics:
            topic = results[results['pred_topic'] == i]
            count = 0
            for j in topic.index:
                count +=1   
                topicContent.append(topic.quoteText[j])
                topicIndx.append(i)
        df1 = pd.DataFrame({'topic':topicIndx,
                       'content':topicContent})
        df1 = df1.replace('\n','', regex=True)
        df1 = df1.replace('<i>','', regex=True)
        df1 = df1.replace('&#13;','', regex=True)
        return render_template('result-Copy1.html', prediction = my_prediction, df1 = df1.to_html())
Example #24
 def test_utf8_kanji(self):
     """Using the UTF-8 kanji sample, load article which is written in kanji"""
     sample = load_sample("utf-8-kanji.sample.html")
     doc = Document(sample)
     res = doc.summary()
Example #25
 def format_html(cls, row, media_path, content=None, custom_html=False):
     media_dir, file_path = os.path.split(media_path)
     resource_dir = os.path.join(settings.ARCHIVE_LOCATION, 'resources', str(row.id))
     resource_link = '/{}/{}/{}/{}'.format(row.usr.username, row.directory, str(row.id), 'resources')
     if not os.path.exists(resource_dir):
         os.makedirs(resource_dir)
     if not content:
         content = ""
         with open(media_path, encoding='utf-8', mode='r') as fd:
             content = fd.read()
     soup = BeautifulSoup(content, 'lxml')
     for script in soup.find_all('script'):
         script.decompose()
     url_path = row.url
     ourl = urlparse(url_path)
     ourld = ourl.scheme + '://' + ourl.netloc
     link_list = soup.find_all(['a', 'link', 'img'])
     for link in link_list:
         if link.name == 'img':
             lnk = link.get('src', '')
         else:
             lnk = link.get('href', '')
         if lnk and lnk != '#':
             if link.name == 'img' or (link.name == 'link' and '.css' in lnk):
                 lnk = dbxs.format_link(lnk, url_path)
                 lnk_bytes = bytes(lnk, 'utf-8')
                 h = hashlib.sha256(lnk_bytes)
                 lnk_hash = h.hexdigest()
                 if link.name == 'img':
                     link['src'] = resource_link + '/' + lnk_hash
                     if custom_html:
                         link['class'] = 'card-img-top'
                 else:
                     lnk_hash = lnk_hash + '.css'
                     link['href'] = resource_link + '/' + lnk_hash
                 file_image = os.path.join(resource_dir, lnk_hash)
                 if not os.path.exists(file_image):
                     cls.vnt_noblock.get(lnk, out=file_image)
                     logger.info('getting file: {}, out: {}'.format(lnk, file_image))
             elif lnk.startswith('http'):
                 pass
             else:
                 nlnk = dbxs.format_link(lnk, url_path)
                 if link.name == 'img':
                     link['src'] = nlnk
                     if custom_html:
                         link['class'] = 'card-img-top'
                 else:
                     link['href'] = nlnk
     if custom_html:
         ndata = soup.prettify()
         if soup.title:
             title = soup.title.text
         else:
             title = row.url.rsplit('/')[-1]
         data = Document(ndata)
         data_sum = data.summary()
         if data_sum:
             nsoup = BeautifulSoup(data_sum, 'lxml')
             if nsoup.text.strip():
                 data = cls.custom_template(title, nsoup.prettify(), row)
             else:
                 data = cls.custom_soup(ndata, title, row)
         else:
             data = cls.custom_soup(ndata, title, row)
     else:
         data = soup.prettify()
     return bytes(data, 'utf-8')
Example #26
def selectalgo(search_name, _PATH):
    jenableparallel = True
    try:
        jieba.enable_parallel(2)
    except:
        jenableparallel = False
        print("This env can't enable jieba parallel")
    link = "https://zh.wikipedia.org/wiki/" + search_name
    site = requests.get(link)
    text = BeautifulSoup(site.content, "html.parser")
    wikiTitle = text.find(id="firstHeading").getText()
    text = text.find(id="mw-content-text").extract()
    decolist = [
        "hatnote", "infobox", "navbox", "vertical-navbox", "toc",
        "mw-editsection", "reference", "plainlist", "plainlists",
        "references-column-width", "refbegin"
    ]  # decompose key word
    for deco in decolist:
        for s in text.find_all(class_=deco):
            s.decompose()
    for s in text.find_all("sup"):
        text.sup.decompose()
    if (text.find(id="noarticletext")):
        print("noarticletext")
        return "noarticletext", None
    selectpos = ["l", "n", "nr", "v", "vn", "eng"]  # select pos
    tags = jieba.analyse.extract_tags(OpenCC('tw2sp').convert(text.getText()),
                                      topK=20,
                                      withWeight=True,
                                      allowPOS=(selectpos))
    bantag = ["編輯", "條目"]  # ban wiki tag
    taglist = Taglist()
    # tfidffile = open(_PATH+search_name+"textsegmentation.txt", "w")
    for tag, wei in tags:
        if OpenCC('s2twp').convert(tag) in bantag or OpenCC('s2twp').convert(
                tag) in search_name:
            # tags.remove((tag, wei))
            continue
        print(tag, wei)
        taglist.append(Tag(tag, wei))
        # tfidffile.write("{} {}\n".format(tag, wei))
    # tfidffile.close()
    header = {
        "User-Agent":
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"
    }
    search_results = google.search(search_name)
    banword = [
        "ppt", "slide", "pdf", "news", "tv", "facebook.com", "平台", "平臺",
        "books.com", "course", "課程", "偽基", "youtube.com", "cw.com",
        "www.104.com", "udn.com", "KKTIX", "pcschool.com"
    ]
    # banword = []
    selectsite = []
    opcc = OpenCC('tw2sp')
    for i, res in enumerate(search_results):
        print(res.name, "{}/{}".format(i + 1, len(search_results)))
        print(res.link)
        banflag = False
        for bw in banword:
            if bw in res.name or bw in res.link:
                print("<{}>".format(bw))
                banflag = True
                break
        if banflag:
            continue
        try:
            response = requests.get(res.link, headers=header)
        except:
            print("some thing error")
        else:
            if "wikipedia" in res.link and False:
                print("iswiki")
                soup = text
            else:
                doc = Document(response.text)
                newhtml = doc.summary()
                converted = opcc.convert(newhtml)
                soup = BeautifulSoup(converted, "html.parser")

            words = jbps.cut(soup.get_text())
            # record = []
            # record.append(res.name+"\n")
            # record.append(res.link+"\n")
            # record.append(res.description+"\n")
            score = 0
            tagset = set()
            for word, _ in words:
                #                print(word)
                index = taglist.isInName(word)
                if index >= 0 and not index in tagset:
                    # record.append("%s %f\n" % (word, taglist[index].weight))
                    score += taglist[index].weight
                    tagset.add(index)


#            print(res.name, score)
            if score > 0:
                webname = ""
                offset = 7
                if res.link[offset] == '/':
                    offset += 1
                for c in res.link[offset:]:
                    if c != '/':
                        webname += c
                    else:
                        break
                print(webname)
                selectsite.append(
                    Selected(res.name, webname, res.link, score,
                             res.description, soup))
                # record.append(str(score))
                # with open(_PATH+"score/{}_{:.2f}.txt".format(webname, score), "w") as file:
                #     file.writelines(record)
    if jenableparallel:
        jieba.enable_parallel()
    return wikiTitle, sorted(selectsite, key=lambda s: s.score,
                             reverse=True)[:5]
Example #27
    if not request.headers["content-type"][:9] in ["text/html", "text/plain"]:
        return False

    return True


def get_site_content(link):
    """Try and extract site content from url"""
    rv = ""

    try:
        r = requests.get(link, timeout=15.0)
    except requests.exceptions.RequestException, e:
        logger.warning("Failed loading URL '{}': {}".format(link, e))
    else:
        if valid_request(r):
            # extract the  (most likely) main content
            doc = Document(r.text, url=link)
            content = doc.summary(html_partial=True)
            rv = remove_html(content)
        else:
            logger.info("Invalid request {} for url '{}'".format(r, link))

    return rv


def repeated_func_schedule(time, func):
    spawn_later(0, func)
    spawn_later(time, repeated_func_schedule, time, func)
Example #28
def html_read_to_text(html):
    doc = Document(html)
    print(doc.title())
    print(doc.summary())
Example #29
 def clean_content((url, content)):
     try:
         doc = Document(content)
         yield url, doc.summary()
     except Unparseable:
         pass
Example #30
def clean_html(epub, epub_path, source_code, url,
               file_idx):  # activated from fetch_page
    blacklist = ['script', 'style', 'dd', 'em', 'text', 'blockquote']
    graylist = ['div', 'h1', 'h2', 'h3', 'h4', 'h5', 'span']
    doc = Document(source_code.text)
    # or: DefaultExtractor ArticleExtractor ArticleSentencesExtractor KeepEverythingExtractor
    # NumWordsRulesExtractor CanolaExtractor KeepEverythingWithMinKWordsExtractor LargestContentExtractor
    # extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    # extracted_html = extractor.getHTML()
    # print (source_code.text)
    base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    # soup = BeautifulSoup(doc.summary(), "html.parser")
    soup = BeautifulSoup(doc.summary(), "lxml")
    # print(soup)
    # soup = BeautifulSoup(extracted_html, "html.parser")
    for tag in soup.findAll():
        del tag['srcset']
        del tag['align']
        del tag['data-file-height']
        del tag['data-file-width']
        del tag['role']
        id = str(tag.get('id'))
        ch = id.find(':')
        if ch > -1:
            id = id[:ch] + id[ch + 1:]
            tag['id'] = id
            # print(': '+id)
        ch = id.find(',')
        if ch > -1:
            id = id[:ch] + id[ch + 1:]
            tag['id'] = id
            # print(', '+id)
        ch = id.find('.')
        if ch > -1:
            id = id[:ch] + id[ch + 1:]
            tag['id'] = id
            # print('. '+id)
        if tag.name.lower() in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif tag.name.lower() in graylist:
            tag.attrs = []
            # del tag['class']
    for tag in soup.findAll(
            'a'):  # make all external links absolute and complete
        href = str(tag.get('href'))
        if href and href != 'None':  # tag.get() may return None, which str() turns into 'None'
            if href.startswith('http'):
                pass
            elif href.startswith('//'):
                href = 'http:' + href
            elif href.startswith('/'):
                href = base_url + href
            elif href.startswith('#'):  # relative link to #id
                pass
            else:
                href = url + '/' + href
            tag['href'] = href
    idx = 0
    # for tag in soup.findAll('html'):
    #     tag['xmlns'] = "http://www.w3.org/1999/xhtml"
    for tag in soup.findAll('img'):
        src = tag.get('src')
        ext = src[-3:]
        if ext == 'png' or ext == 'jpg':
            if src.startswith('http'):
                pass
            elif src.startswith('//'):
                src = 'http:' + src
            elif src.startswith('/'):
                src = base_url + src
            else:
                src = url + '/' + src
            img_name = 'img_' + str(file_idx) + '_' + str(idx) + '.' + ext
            # format: images/img_0_0.png
            tag['src'] = '../' + get_img(epub, epub_path, src, img_name)
            del tag['srcset']
            idx += 1
    html = str(soup)
    body = re.compile(r'<body\b[^>]*>', re.I)  # <body attributes>-tag
    html = body.sub('<body><h1>' + doc.title() + '</h1>', html)
    head = re.compile(r'<html\b[^>]*>', re.I)  # <html attributes>-tag
    html = head.sub(
        '<html xmlns="http://www.w3.org/1999/xhtml"><head><title>' +
        doc.title() +
        '</title><link href="../css/epub.css" rel="stylesheet" type="text/css"/></head>',
        html)
    # print(html[:300])

    doctype = '''<?xml version='1.0' encoding='utf-8'?>
        <!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
        '''
    # html = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">' + html
    html = doctype + html
    html = html.encode('utf-8')
    return html, doc.title()
Example #31
def searchOndDay(curDate, howManyNewsOneDay=5, fiddler=None):
    # search engine
    searchEngine = 'bing.com'  # do not change this; the complete URL below is built specifically for Bing
    # compute the URL
    delta = datetime3.date(2019, 11, 1) - curDate
    deltaNum = 18201 - delta.days
    print('搜素日期:', curDate)
    searchUrl = r"https://cn.bing.com/search?q=737max%e7%a9%ba%e9%9a%be&filters=ex1%3a%22ez5_" + \
        str(deltaNum) + "_" + str(deltaNum) + \
        r"%22&redir=2&frb=1&qpvt=737max%e7%a9%ba%e9%9a%be"
    print('搜索Url:', searchUrl)

    # send an HTTP request and receive the result
    r = requests.getPlus(searchUrl, verify=fiddler)
    # check whether the HTTP request returned successfully
    if r.status_code != 200:
        print('error:搜索页状态码异常')
        return 0
    # get the returned HTML text
    '''r.encoding = "utf-8"  # since this targets Bing, we know the encoding is utf-8'''
    searchHtml = r.text
    # check whether the response contains search results, i.e. whether we have been banned
    t = re.findall(r'条结果', searchHtml, re.I)
    if t == []:
        print('error:被ban了')
        return 0
    else:
        t = re.findall(r'\d+(?= 条结果)', searchHtml, re.I)
        t = t[0]
        print('搜索结果共几条:', t)
    # parse searchHtml
    tree = etree.HTML(searchHtml)
    # how many genuinely valid news items there are (excluding video and image collections)
    newsList = tree.xpath(
        '/html/body[1]/div[1]/main[1]/ol[1]/li[@class="b_algo"]')
    newsNum = len(newsList)
    print('真正有效的新闻共几条:', newsNum)
    # save the search results page
    file = open("./corpora/" + searchEngine + '_' + str(curDate) + '.html',
                "wb")
    file.write(searchHtml.encode('utf-8'))
    file.close()

    # loop over up to howManyNewsOneDay genuinely valid news items
    newsIndex = 0  # note: effectively starts at 1, since it is incremented at the top of the loop (historical reasons, never cleaned up)
    howManyNewsSaved = 0
    while howManyNewsSaved < howManyNewsOneDay:
        newsIndex += 1
        # if there are not that many items in total, bail out early
        if newsIndex > newsNum:
            break
        print('  第%d个新闻' % newsIndex)
        # pull out the info for the current news item
        news = newsList[newsIndex - 1]
        titleElement = news.xpath('./h2/a')
        # check whether this is a web-page news item (it could be a ppt or pdf)
        if titleElement == []:
            print('    新闻可能是文件形式,不算数')
            continue
        titleElement = titleElement[0]
        newsUrl = titleElement.attrib['href']
        print('    网址:', newsUrl)
        newsTitle = titleElement.text
        print('    标题:', newsTitle)
        introduction = news.xpath('string(./div[1]/p[1])')
        print('    简介:', end='')
        print(indent(introduction, length=40, fIndent=0, lIndent=10))
        newsTime = re.findall(r'^\d+-\d+-\d+', introduction, re.I)[0]
        newsTimeYear = int(re.findall(r'^\d+(?=-)', newsTime, re.I)[0])
        newsTimeMonth = int(re.findall(r'(?<=-)\d+(?=-)', newsTime, re.I)[0])
        newsTimeDay = int(re.findall(r'(?<=-)\d+$', newsTime, re.I)[0])
        print('    发布时间:', newsTime)
        newsId = searchEngine + '_' + str(curDate) + '_' + str(newsIndex)
        print('    Id:', newsId)

        # check whether this is a text news item and whether it qualifies
        host = re.search('(?<=://)\S+?(?=/)', newsUrl).group()
        if host in [
                'www.yunjuu.com',
                'v.qq.com',
                'www.bilibili.com',
                'v.youku.com',
                'haokan.baidu.com',
        ]:
            print('    新闻不合格,这个不算数')
            continue

        # fetch the news page
        try:
            r = requests.getPlus(newsUrl, verify=fiddler)
        except Exception as e:
            print('    这个新闻网站跪了,不算数:', e)
            continue
        # did the request succeed?
        if r.status_code != 200:
            print('    error: 状态码非200,不算数')
            continue
        # get the returned HTML text
        '''r.encoding = "utf-8"'''
        newsHtml = r.text
        # strip newlines and extra spaces from the HTML
        newsHtml = newsHtml.replace('\n', '')
        newsHtml = newsHtml.replace('  ', '')
        # use readability to extract the main content
        newsdoc = Document(newsHtml)
        newsTitle = newsdoc.title()
        print('    标题:', newsTitle)
        newsContentWithTags = newsdoc.summary()  # the readability result still carries HTML tags
        # strip the HTML tags to get plain text
        newsContent = html2text(newsContentWithTags)
        # print the content
        print('    正文:', end='')
        print(indent(newsContent, length=40, fIndent=0, lIndent=10))

        # check whether this is a text news item and whether it qualifies
        if len(newsContent) < 270:
            print('    新闻不合格,这个不算数')
            continue

        # insert into the database
        SysDb.insertRow(
            'websiteTabel',
            {
                '搜索引擎': searchEngine,
                '搜索日期年': curDate.year,
                '搜索日期月': curDate.month,
                '搜索日期日': curDate.day,
                '搜索网址': searchUrl,
                '搜索html': searchHtml,
                '新闻序号': newsIndex,
                '新闻ID': newsId,
                '新闻网址原': newsUrl,
                '新闻网址真': r.url,
                '新闻html': newsHtml,
                '新闻标题': newsTitle,
                # '新闻作者': {'类型': '文本', '初始值': None, '主键否': '非主键'},
                # '新闻机构': {'类型': '文本', '初始值': None, '主键否': '非主键'},
                '新闻日期年': newsTimeYear,
                '新闻日期月': newsTimeMonth,
                '新闻日期日': newsTimeDay,
                '新闻正文': newsContent
            })
        # saved one item, increment the counter
        howManyNewsSaved += 1
    def parseitem(self,response):
        ':type response: Response'

        if 'Please turn on JavaScript' in response.body:
            body = response.body
            body = re.sub('<p class="caption"[^<]+', '', body)
            body = re.sub('<noscript>(.|\r|\n)*?</noscript>','',body)

            response = response.replace(body=body)

        sel =  Selector(response)
        item = NewsscraperItem()

        
        ### storing the name of URL and source in item dictionary
         
        item['url']= response.url
        item['source']= self.name
        
        ### extracting the time of scraping of data inside the item
        
        item['dateScraped']= strftime("%Y-%m-%d %H:%M:%S", gmtime())
        
        ### check the url category against the allowed domains. If response.url contains the string, handle it here; otherwise the else branch takes over
        try:  
            if 'www.bbc.co.uk' in response.url:
                
                ### extracting title
                title = sel.xpath("//h1[starts-with(@class,'story')]/text()").extract()
                if(title):
                    
                    ### extracting title from the page and checking different xpath for searching the title
                    
                    item['title']=title[0].strip()
                    
                    ### extracting date from the page using xpath
                    
                    d = sel.xpath("//span[@class='date']/text()").extract()[0].strip()
                    
                    ###string to datetime conversion
                    
                    f = strptime(d,'%d %B %Y')
                    
                    ###formating date in a particular format defined in config file  
                    
                    item['date']= strftime(Config['dateformat'],f)
                
                    ### extracting content from the page
                    
                    x = sel.xpath("(//div[@class='story-body']//*[self::p or self::strong]/text()) |(//span[@class='cross-head']/text())|(//div[@class='story-body']/p/a/text())").extract()
                    if len(x) > 1:
                        st="\n"
                        p = st.join(x)
                        
                        ### using regular expression to remove continuous white spaces from the content and replace by single space
                        
                        item['content']= re.sub(r"[ \t\n]+", " ",p)
                    else:
                        # Not able to extract article content using xpath. Move to backup approach and use readability
                        try:
                            html = sel.xpath("//div[@class = 'story-body']").extract() 
                            doc = Document(html)
                            doc.options['debug'] = False
                            
                            try:
                                logging.basicConfig(level=logging.CRITICAL)

                                htmlContent = doc.summary()                    
                                content = html2texthandler(htmlContent)
                            except Exception, e:
                                pass
                            finally:
                                logging.basicConfig(level=logging.INFO)

                            item['content']= re.sub(r"[ \t\n\"]", " ",content)
                        except:
                            return 
Example #33
import requests
from readability import Document

# https://github.com/buriy/python-readability
# response = requests.get('http://example.com')
# response = requests.get('http://usosdelasticsenlaadministracion.blogspot.com/')
# https://williamjturkel.net/2013/06/15/basic-text-analysis-with-command-line-tools-in-linux/
response = requests.get('http://www.eumed.net/ce/2015/1/tecnologia.html')
doc = Document(response.text)
# print(doc.title())
content = doc.summary()
# print(doc.summary())
file = open('tecnologia.html', 'w')
file.write(content)
file.close()
Example #34
def get_summary(content):
    doc = Document(content)
    summary = doc.summary(html_partial=True)
    return summary
Example #35
 def html_select(raw_html, xpath_lan):
     doc = Document(raw_html)
     summary_html = doc.summary()
     # print(summary_html)
     selector = etree.HTML(summary_html)
     return selector.xpath(xpath_lan)
Example #36
def getContent():
            """Collect content"""
            """ Your APPID, AK (API key), SK (secret key) """
            APP_ID = '14658509'
            API_KEY = 'C14bCL7NkReQpak382maUYXi'
            SECRET_KEY = '8vWAXHBTmfL3r96PlKIggpwuXwdNl4wz'
            client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
            # [1 url, 2 title, 3 content, 4 sentiment-analysis items dict, 5 company-name list, 6 comment-opinion list, 7 article category, 8 article tags]
            #http://linyi.iqilu.com/caijing/2018/1117/4113682.shtml
            #monitor_result=[]
            for news_url in urls:                
                one_monitor=[]
                one_monitor.append(news_url)  # (1) url
                try:  # make sure this news item is complete
                    news=urlopen(news_url,timeout=15)  # with a timeout set, urlopen will not wait forever for a response, so it cannot hang
                    news_html=news.read()  # page source as a str; this call conflicts with parse and cannot run together with it
                    #response = requests.get('http://example.com')
                    #doc = Document(response.text)
                except:
                    one_monitor.append("urlopen_error")
                    monitor_result.append(one_monitor) 
                    success_num +=1
                    print("打开网址错误")
                    continue
                try:  # (3) content; comment-opinion extraction handles at most 3000 characters
                    news_contents=Document(news_html)
                    news_title=news_contents.title().strip(" ")[:39].encode("utf-8")  # (2) title; the default ascii codec would raise an error for out-of-range characters
                    #print(news_title)  # strip whitespace characters (including '\n', '\r', '\t', ' ')
                    one_monitor.append(news_title)
                    news_content=BeautifulSoup(news_contents.summary()).get_text().strip(" ")[:2000].encode("utf-8")
                    #len(news_content)#print(news_content)
                    one_monitor.append(news_content)
                    emotion_content=news_content.decode("utf-8")[:500].encode("utf-8")  # avoid slicing the str at a fixed byte length, which could leave an incomplete Chinese character
                    #print(emotion_content)
                except:
                    one_monitor.append("extract_error")
                try:             
                    #print(emotion_content)  #print(u"我很高兴"[:1000])#我很高兴
                    emotion=client.sentimentClassify(emotion_content)["items"]  # (4) sentiment
                    one_monitor.append(emotion)
                except:
                    one_monitor.append("emotion_error")
                try:  # (5) organization-name list
#                    ids = [1,4,3,3,4,2,3,4,5,6,1]
#                    list(set(ids))  # the result comes back re-ordered
                    orgs=[item["item"].encode("utf-8") for item in client.lexer(news_content)["items"] if item["ne"] =="ORG"]
                    one_monitor.append(";".join(list(set(orgs))))
                    #print(";".join(list(set(orgs))))
                except:
                    one_monitor.append("org_error")
                try:  # (6) comment-opinion list
                    conments=[item['abstract'].encode("utf-8") for item in client.commentTag(news_content)['items']]
                    one_monitor.append(";".join(list(set(conments))))
                    #print(";".join(list(set(conments))))
                except:
                    one_monitor.append("comment_error")
                try:  # (7) article category
#                    a=[[1,2],[4,3,5]]
#                    [c for b in a for c in b]
                    group=client.topic(news_title, news_content)["item"].values()  # [[dict], [dict]]
                    #group=client.topic("对严重失信者,能否限制其发预付卡?法学家谈如何破解预付卡立法瓶颈", news_content)["item"].values()
                    value_list=[dic[u'tag'] for dic_list in group for dic in dic_list]  # float values cannot be passed to join
                    one_monitor.append(u";".join(value_list).encode("utf-8"))
                    #print(u";".join(value_list).encode("utf-8"))
                except:
                    one_monitor.append("topic_error")
                try:  # (8) article tags
                    keyword=client.keyword(news_title, news_content)["items"]  # [dict]
                    #keyword=client.keyword("对严重失信者,能否限制其发预付卡?法学家谈如何破解预付卡立法瓶颈", news_content)["items"]
                    key_list=[dic[u'tag'] for dic in keyword]
                    one_monitor.append(u";".join(key_list).encode("utf-8"))                   
                    #print(u";".join(key_list).encode("utf-8"))
                    print("成功%s"%success_num)
                except: 
                    one_monitor.append("keyword_error")
                    error_num +=1
                    print("其中有误%s"%error_num)                                   
                    
                monitor_result.append(one_monitor) 
                success_num +=1
                #time.sleep(1)
                
                if success_num % 200 == 0:  # save periodically so we do not lose everything at the last moment
                    with open("./temp/risk_monitoring%s.csv"%index,"w") as reader:
                        writer = csv.writer(reader)
                        writer.writerows(monitor_result)
Example #37
def content(link):
    target = urllib.urlopen(link)
    d = Document(input=target)
    # catching if not u''
    return d.summary()
Example #38
def ndtv_anti_ad_block_text(article):
    doc = Document(article.html)
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(doc.summary())
                
                    ### extracting content from the page
                    try:
                        x = (sel.xpath("(//div[@class='story-inner']/p/text()) |(//h2[@class='heading']/text())").extract())
                        joiner = "\n"
                        p = joiner.join(x)
                    
                        ### using regular expression to remove continuous white spaces from the content and replace by single space
                    
                        item['content']= re.sub(r"[ \t\n]+", " ",p)
                    except:
                        try:
                            html = sel.xpath("//div[@class = 'story-body']").extract() 
                            logging.basicConfig(level=logging.CRITICAL)
                            doc = Document(html)
                            htmlContent = doc.summary()                    
                            content = html2texthandler(htmlContent)
                            item['content']= re.sub(r"[ \t\n\"]", " ",content)
                        except:
                            print"image coontent"
                            return 
                        finally:
                            logging.basicConfig(level=logging.INFO)

                    if (item['content']!="" and item['title']!="" and item['date']!="" ):
                        return item
                    else:
                        return
            ### if the url is of the second category, the else branch below takes care of it
               
            else:
Example #40
def getAndParse(url):
    # hosts to skip
    continue1 = False

    redUrl = getRedictedUrl(url)

    for ig in ignores['hosts']:
        if ig in url or (redUrl and ig in redUrl):
            continue1 = True
            break
    if continue1:
        return None, None

    try:
        newContent, redUrl = getContentAndRedictedUrl(url)

    except Exception as e:
        print 'new content1', e
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)

        except Exception as e:
            print 'new content1', e
            return None, None

        except requests.exceptions.ConnectionError as er:
            print 'new content2', er
            return None, None

    except requests.exceptions.ConnectionError as er:
        print 'new content2', er
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)

        except Exception as e:
            print 'new content1', e
            return None, None

        except requests.exceptions.ConnectionError as er:
            print 'new content2', er
            return None, None

    if not redUrl:
        return None, None

    # filter the redirected url one more time
    # continue2 = False
    # for ig in ignores['hosts']:
    #     if ig in redUrl:
    #         continue2 = True
    #         return None
    #
    # if continue2:
    #     return None

    urlHost = urlparse(redUrl).hostname

    new2 = newContent.encode('utf-8')
    # soup = getSoupByStr(newContent)
    soup = getSoupByStrEncode(new2, "utf-8")

    # apply the common rm (removal) rules uniformly

    for rm in rules['common']['rm']:
        removeNodesFromSoup(rm, soup)  # remove stop nodes

    needAutoExtract = True

    if rules.has_key(urlHost):
        contentRule = rules[urlHost]['content']
        if contentRule:  # a content rule is configured
            specContent = soup.select(contentRule)  # extract the body text according to the config
            if specContent and len(specContent) > 0:
                del specContent[0].attrs
                soup = specContent[0]
                needAutoExtract = False
                # most rules now delete by tag directly, so the configured rm options are only cleared once the content has been found
                if rules[urlHost]['rm'] and len(rules[urlHost]['rm']) > 0:
                    for rm in rules[urlHost]['rm']:
                        removeNodesFromSoup(rm, soup)  # remove stop nodes

        unwrapUseless(soup)



        content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
            .replace(u'</div>', '').replace(u'<div>', '')

    else:  # no rules configured for this host; extract the body text automatically
        # print urlHost, ' : ',url
        # return None

        attemp = soup.select('#content')  # many novel sites put the body text in #content
        if attemp and len(attemp):
            # guessed right
            needAutoExtract = False
            unwrapUseless(soup)
            content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
                .replace(u'</div>', '').replace(u'<div>', '')
        # else:
    if needAutoExtract:
        unwrapUseless(soup)

        doc = Document(unicode(soup).encode(
            'utf-8'))  # may otherwise raise: Expected a bytes object, not a unicode object
        content = doc.summary(html_partial=True)
        # content = content.replace('<html>','').replace('</html>','')

    newContent2 = cleanTailHead(urlHost, content)
    if newContent2 != content:
        content = newContent2

    if content and len(content) < 10:
        return None, None

    # newSoup = getSoupByStr(content)
    # newSoup.select('div')[0].unwrap()

    # content = unicode(newSoup).replace(u'<body>','').replace(u'</body>','')
    # content = content.replace(r'<p>\d+、.*</b></p>', '')

    # content = re.sub(u'<p>\d+、((?:.|\n)*?)</p>', "", content, 1)
    content = content.replace(u'�', u'')
    content = content.replace(u'\'', r'\'')
    return content, urlHost
three = "https://article.hareruyamtg.com/article/48018/?lang=en"

headers = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
response = requests.get(two, headers=headers)
doc = Document(response.text)

h = html2text.HTML2Text()
h.ignore_links = True
text = h.handle(response.text)

print(text)

cleaned_content = cleanhtml(doc.summary())
sentences = text.split(".")

for sent in sentences:
    for card in CardNames:
        match = sent.find(card)
        if match != -1:
            le = len(card)
            x = match.__index__()
            print(f"\n{10*'#'}")
            print("Match:", f"{Fore.RED}{sent[x:x + le]}{Style.RESET_ALL}")
            print(
                "Corpus:", sent[:x - 1],
                f"{Fore.RED}{sent[x:x + le]}{Style.RESET_ALL} {sent[x+le+1:]}.",
                "\n")
            print(f"{10*'#'}\n")
Example #42
 def test_not_self_closing(self):
     sample = '<h2><a href="#"></a>foobar</h2>'
     doc = Document(sample)
     assert (
         '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
         == doc.summary())
Example #43
def link2html(link):
    response = requests.get(link)
    doc = Document(response.text)
    return doc.title(), doc.summary()
Example #44
 def test_too_many_images_sample_html_partial(self):
     """Using the too-many-images sample, make sure we still get the article."""
     sample = load_sample("too-many-images.sample.html")
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="post-body', res[0:26])
 def test_too_many_images_sample_html_partial(self):
     """Using the too-many-images sample, make sure we still get the article."""
     sample = load_sample('too-many-images.sample.html')
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="post-body', res[0:26])
Example #46
                        publication = field.text

                if body and re.search('Redistribution rights for this field are unavailable',body) and len(body) < 100:
                    print("    Warning: no redistribution rights available for that article")
                    body = "<p><b>Redistribution rights for this article were not available.</b></p>"

            except (ArticleMissing, ArticleAccessDenied) as e:
                print("    Warning: couldn't fetch that article")
                headline = link_text
                body = "<p><b>The Guardian Open Platform returned an error for that article: {0}</b></p>".format(e)
                body += '<p>You can still try <a href="{0}">the original article link</a></p>'.format(link_url)

                force_request = requests.get(link_url)
                force_article = Document(force_request.text)

                body += cleaner.clean_html(force_article.summary())

            page_filename = "{0:03d}.html".format(page_number)

            html_body = E.body(E.h3(headline))

            if byline:
                html_body.append( E.h4('By '+byline) )
            html_body.append( E.p('[{s}]'.format(s=section)) )

            if standfirst:
                standfirst_fragments = fragments_fromstring(standfirst)
                standfirst_element = E.p( E.em( *standfirst_fragments ) )
                html_body.append( standfirst_element )

            if thumbnail:
Example #47
from email.mime.text import MIMEText
from email.utils import formatdate
from email import encoders

# Get the html
response = requests.get("http://URL/TO/CONVERT")

# Clean up the html using readability
doc = Document(response.text)

# Use the webpage title as base file name
file_name = re.sub(r'[^a-zA-Z0-9]+', '-', doc.title())

# Write the html response to local file
f = open(file_name + '.html', 'w')
f.write(doc.summary())
f.close()

# Convert the local html file to .mobi
call(["./kindlegen", file_name + '.html'])

# Send the document as email attachment
msg = MIMEMultipart()
send_from = msg['From'] = '*****@*****.**'
send_to = msg['To'] = '*****@*****.**'  # Can be 'Send to Kindle' email
msg['Date'] = formatdate(localtime=True)
msg['Subject'] = file_name + ".mobi"

# Attach the email body
msg.attach(
    MIMEText('Want to write a customized email body? Then put it here.'))
Example #48
def get_content_from_url(url):
    def srcrepl(base_url, match):
        absolute_link = urljoin(base_url, match.group(3))
        absolute_link = '/link?url=' + absolute_link
        return "<" + match.group(1) + match.group(
            2) + "=" + "\"" + absolute_link + "\"" + match.group(4) + ">"

    def relative_to_absolute_urls(fragment, base_url):
        p = re.compile(r"<(.*?)(src|href)=\"(?!http)(.*?)\"(.*?)>")
        absolute_fragment = p.sub(partial(srcrepl, base_url), fragment)
        return absolute_fragment

    file_cache = f'./cache/sites/{get_cache_key(url)}.html'

    if not path.exists(file_cache):
        response = requests.get(url)
        text = response.text
        with open(file_cache, 'w') as f:
            f.write(text)
    else:
        with open(file_cache) as f:
            text = str(f.read())

    doc = Document(text)
    summary = doc.summary(html_partial=True)

    if 'wikipedia.org' in url:
        d = pq(summary)
        to_remove = [
            "#External_links", "#General_information", "#Experiments",
            "#Online_lectures", '.spoken-wikipedia', '#Bibliography', '.book',
            '.refbegin', '.shortdescription', '.reference', '.infobox',
            '.reflist', '#References', '#Further_reading', '#See_also',
            '.mw-editsection', '.tright'
        ]

        def check_link(index, a):
            da = pq(a)

            if da.attr('href') and '#cite_' in da.attr('href'):
                da.remove()

        d('a').each(check_link)

        for selector in to_remove:
            d(selector).remove()

        summary = d.html()

    try:
        parsed_url = urlparse(url)
        base_url = parsed_url.scheme + '://' + parsed_url.netloc
        summary = relative_to_absolute_urls(summary, base_url)
    except Exception:
        # If the URL cannot be parsed, leave relative links untouched
        pass

    soup = BeautifulSoup(summary, features="lxml")
    content = soup.get_text().rstrip('\n')
    content = re.sub(r'\n+', '\n', content).strip()

    return summary, content, doc.title()
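
# A minimal usage sketch (not part of the original snippet): it assumes the
# ./cache/sites/ directory and the get_cache_key helper exist, and the URL is
# only an illustrative placeholder. It shows the three return values: the
# readability summary HTML, the extracted plain text, and the page title.
if __name__ == '__main__':
    summary_html, plain_text, page_title = get_content_from_url(
        'https://en.wikipedia.org/wiki/Readability')
    print(page_title)
    print(plain_text[:200])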
#!/usr/bin/python
import requests
from readability import Document
import sys
from markdownify import markdownify as md

bookmarkDir = "/home/ironman/obsidians/personalObsidian/bookmarks/"

if len(sys.argv) > 1:
    url = sys.argv[1]
    response = requests.get(url)
    doc = Document(response.text)

    fileName = doc.title() + ".md"
    fileName = fileName.replace('/', ' ')

    markdownSummary = md(doc.summary())
    markdown = "# {} \n\n *{}* \n\n {}".format(doc.title(), url,
                                               markdownSummary)

    with open(bookmarkDir + fileName, 'w') as the_file:
        the_file.write(markdown)

else:
    print("please enter a url to make article view")
Beispiel #50
0
    for row in results:
        # content = row[1]
        # content = row[4].replace('mi', 'mo')
        id = row[0]
        # url = row[1]
        url = 'http://www.3dllc.com/html/37/37023/9515879.html'

        # if not u'easou' in url:
        #     continue

        newContent = getContent(url)

        doc = Document(newContent)

        content = doc.summary(html_partial=True)

        #
        # soup = getSoupByStr(newContent)
        #
        # ps = soup.select('#chapterContent')[0]
        # # ps.select('div')[0].unwrap()
        # # ps.unwrap()
        # for water in soup.select('.watermark'):
        #     water.extract()

        #
        # t = soup.select('p')[0]
        # title = t.get_text()
        # if re.match('\d+.*',title):
        #     # if id < 1766:
Beispiel #51
0
def handle_data():
    def cleancap(raw_cap):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_cap)
        tmp = cleantext.split('\n')
        cap = list()
        pre = ''
        for line in tmp:
            if line.replace(' ', '') and line != pre:
                if '-->' in line: cap.append('')
                else: pre = line
                cap.append(line)
        tmp = set()
        for idx in range(len(cap)):
            if '-->' in cap[idx] and (idx >= len(cap) - 2
                                      or '-->' in cap[idx + 2]):
                tmp.add(idx)
                tmp.add(idx + 1)
        final = list()
        for idx in range(len(cap)):
            if idx not in tmp: final.append(cap[idx])
        return '\n'.join(final)

    user_level = request.form['user_level']
    title = ''
    publish_date = ''
    text = request.form['text']
    if (text.startswith('http://www.youtube.com')
            or text.startswith('http://youtube.com')
            or text.startswith('http://youtu.be')
            or text.startswith('https://www.youtube.com')
            or text.startswith('https://youtube.com')
            or text.startswith('https://youtu.be')):
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'skip_download': True,  # We just want to extract the info
            'outtmpl': 'download/target'  # file_path/target
        }
        file = ''
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([text])
            dirPath = "download"
            fileList = os.listdir(dirPath)
            if 'target.en.vtt' in fileList:
                file = cleancap(open('download/target.en.vtt').read())
            else:
                file = 'There is no English subtitle in this video!'
            for fileName in fileList:
                if os.path.isfile(os.path.join(dirPath, fileName)):
                    os.remove(os.path.join(dirPath, fileName))
        v_id = text.split('=')[-1]
        content = [v_id, file]
        type_ = 'youtube'
        r = requests.get(text)
        if r.status_code < 400:
            title = BeautifulSoup(r.text, 'html.parser').find('title').text
            publish_date = BeautifulSoup(r.text, 'html.parser').find(
                'meta', itemprop="datePublished")['content']
    elif text.startswith('http://') or text.startswith('https://'):
        response = requests.get(text, headers=headers)
        doc = Document(remove_sometag(response.text))
        title = doc.short_title()
        publish_date = getPublishDate(response.content.decode('UTF-8'))
        content = doc.summary()
        type_ = 'url'
    else:
        content = text
        type_ = 'text'

    content = clean_content(content, type_)
    new,pure_text,vocab_dict = create_article(title, user_level, content, type_=='youtube', \
                         set(dictWord['V'].keys()), set(dictWord['N'].keys()), set(dictWord['ADJ'].keys()))
    store(pure_text, vocab_dict, user_level)
    return render_template('format.html', title=title, publish_date=publish_date, \
                           user_level=user_level, content=new)
Beispiel #52
0
def get_article_body(article, feed):

    body = ""

    # If scrape, get article with readability
    if feed["scrape"]:

        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
        }
        response = requests.get(article.link, headers=headers)
        doc = Document(response.text)
        body = doc.summary()

    # Else construct from article object
    else:

        # Add all content to body
        if hasattr(article, "content"):
            for c in article.content:
                if c.type == "text/html" or c.type == "text/plain":
                    body += c.value
        # Use summary as fallback
        elif hasattr(article, "summary"):
            body += article.summary

    # Replace relative links with absolute ones, using beautifulsoup
    try:
        splitted_url = urlsplit(article.link)
    except Exception:
        splitted_url = urlsplit(feed["url"])

    soup = BeautifulSoup(body, features="lxml")

    for img in soup.find_all("img", src=True):
        src = img.get("src")
        splitted_src = urlsplit(src)
        constructed_src = [
            splitted_src.scheme,
            splitted_src.netloc,
            splitted_src.path,
            splitted_src.query,
            splitted_src.fragment,
        ]
        if constructed_src[0] == "":
            constructed_src[0] = splitted_url.scheme
        if constructed_src[1] == "":
            constructed_src[1] = splitted_url.netloc
        new_src = urlunsplit(constructed_src)
        if new_src.startswith("http"):
            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)

    for a in soup.find_all("a", href=True):
        href = a.get("href")
        splitted_href = urlsplit(href)
        constructed_href = [
            splitted_href.scheme,
            splitted_href.netloc,
            splitted_href.path,
            splitted_href.query,
            splitted_href.fragment,
        ]
        if constructed_href[0] == "":
            constructed_href[0] = splitted_url.scheme
        if constructed_href[1] == "":
            constructed_href[1] = splitted_url.netloc
        new_href = urlunsplit(constructed_href)
        if new_href.startswith("http"):
            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)

    return body
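
# A small standalone sketch (not from the original snippet) of the
# relative-to-absolute rewriting used above: empty scheme/netloc components
# of a link are filled in from the article's own URL before the link is
# rebuilt with urlunsplit.
from urllib.parse import urlsplit, urlunsplit


def absolutize(link, page_url):
    # Split both URLs into (scheme, netloc, path, query, fragment)
    page = urlsplit(page_url)
    parts = list(urlsplit(link))
    if parts[0] == "":
        parts[0] = page.scheme
    if parts[1] == "":
        parts[1] = page.netloc
    return urlunsplit(parts)


# absolutize("/img/photo.jpg", "https://example.com/feed/item")
# -> "https://example.com/img/photo.jpg"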
Beispiel #53
0
def open_anything(source, type_arr, encode=None, logger=None):
    """URI, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file)
    and deal with it in a uniform manner.  The returned object normally has
    the basic stdio read methods (read, readline, readlines); for HTML
    sources the extracted article text is returned directly as a string.
    Just .close() file-like objects when you're done with them.

    Examples:
    >>> from xml.dom import minidom
    >>> sock = open_anything("http://localhost/kant.xml")
    >>> doc = minidom.parse(sock)
    >>> sock.close()
    >>> sock = open_anything("c:\\inetpub\\wwwroot\\kant.xml")
    >>> doc = minidom.parse(sock)
    >>> sock.close()

    This function is part of "Dive Into Python", a free Python book for
    experienced programmers.  Visit http://diveintopython.org/ for the
    latest version.

    """

    print('\nopen_anything ({})...'.format(source))
    if logger is None:
        logger = create_log(log_name="util")  # ,level=loglevel)
        pass

    if hasattr(source, "read"):
        print("Dealing with text...")
        type_arr[0] = "text"
        return source
        pass

    if source == "-":
        return sys.stdin
        pass

    if (source.startswith('http://') or source.startswith('https://') or
            source.endswith('.html') or source.endswith('.htm')):
        # try to open with urllib2 (if source is http, ftp, or file URL)
        # import urllib2
        print("Dealing with html...")
        type_arr[0] = "html"
        driver = None

        h = 'http'
        if not os.path.exists(h):
            os.mkdir(h)
        head = source.split("/")
        head = head[len(head) - 1]
        head = ''.join([h, '/', os.path.splitext(head)[0]])
        txtname = ''.join([head, '.txt'])
        # print(txtname) # c

        if os.path.exists(txtname):
            print("{0} already exists,".format(txtname),
                  "\njust read from it; stop getting http")
            f = open(txtname, 'r', encoding='utf-8')
            text = f.read()
            f.close()
            return text

        try:
            # return urllib2.urlopen(source, timeout=10)
            response = requests.get(source, timeout=50)
            len_text = len(response.text)
            if len_text > 0:
                doc = Document(response.text)
                res = doc.summary()
                res = cleanhtml(res)
                len_res = len(res)
                if len_res > 0:
                    f = open(txtname, 'w', encoding='utf-8')
                    f.write(res)
                    f.close()
                    print("http to txt, save in {0}".format(txtname))
                else:
                    print("Something wrong!")
                    print("len_text:", len_text)
                    print("len_res:", len_res)
                return res

            driver = webdriver.Chrome()
            '''
            if platform == 'darwin':
              driver = webdriver.Safari()
            elif platform == 'win32':
              #driver = webdriver.Firefox()
              driver = webdriver.Chrome()
            '''

            driver.set_page_load_timeout(50)
            print("\ngetting http : ", source)
            driver.get(source)
            time.sleep(6)
            res = driver.page_source
            driver.close()

            f = open(txtname, 'w', encoding='utf-8')
            f.write(res)
            f.close()
            print("http to txt, save in {0}".format(txtname))
            return res
            pass
        except TimeoutException as e:
            msg = "too much time to load html: {0}, info:{1}".format(source, e)
            print(msg)
            logger.info(msg)
            time.sleep(2)
            driver.get(source)
            time.sleep(13)
            res = driver.page_source
            driver.close()

            f = open(txtname, 'w', encoding='utf-8')
            f.write(res)
            f.close()
            print("http to txt, save in {0}".format(txtname))
            return res
            pass
        except (IOError, OSError) as e:
            msg = "failed to load html: {0}, info: {1}".format(source, e)
            logger.info(msg)
            pass
        except Exception as e:
            msg = "failed to load html: {0}, err: {1}".format(source, e)
            print(msg)
            logger.error(msg)
            pass
        finally:
            if driver is not None:
                driver.quit()

    """
    # try to open with pypdf(if source is pdf)
  
    # failed to open pdf
    # see
    # http://stackoverflow.com/questions/25665/ \
  python-module-for-converting-pdf-to-text
    from PyPDF2 import PdfFileReader
    try:
      type_arr[0]="pdf"
      return PdfFileReader(open(source,"rb"))
      pass
    except (IOError, OSError):
      pass
    """

    if not os.path.exists(source):
        print('\nfile not exist: {}'.format(source))
        return None

    if source.endswith('.pdf'):
        # try to open with pypdf(if source is pdf)
        print("Dealing with pdf...")
        type_arr[0] = "pdf"
        try:
            # print "using pdf method to open"
            return get_pdf_io(source)
            pass
        except (IOError, OSError) as e:
            msg = "failed to load pdf: {0}, info:{1}".format(source, e)
            logger.info(msg)
            pass

    # try to open with pywin32(if source is doc)
    # @todo use pywin32 to open doc

    # try to open with native open function (if source is pathname)
    type_arr[0] = "text"
    try:
        return open(source, encoding=encode)
        pass
    except (IOError, OSError) as e:
        msg = "failed to load txt: {0}, info:{1}".format(source, e)
        logger.info(msg)
        pass

    print('\nFailed to open_anything ({})'.format(source))
    return None
    pass
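
# A brief usage sketch (not from the original code): type_arr acts as a
# one-element "out" parameter that reports which kind of source was opened
# ("text", "html", or "pdf"). The URL below is only an illustrative
# placeholder.
if __name__ == '__main__':
    type_arr = ["unknown"]
    result = open_anything("https://example.com/article.html", type_arr)
    if result is not None:
        print("source type:", type_arr[0])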
 def test_wrong_link_issue_49(self):
     """We shouldn't break on bad HTML."""
     sample = load_sample('the-hurricane-rubin-carter-denzel-washington.html')
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="content__article-body ', res[0:39])
Beispiel #55
0
def get_main_html(html):
    doc = Document(html)
    return doc.summary()
 def test_nyt_sample_html_iframe(self):
     """Using the nyt sample, make sure the summary holds an <iframe> element (youtube video)"""
     sample = load_sample('nyt-article-video.sample.html')
     doc = Document(sample, url='http://nytimes.com/')
     res = doc.summary()
     self.assertTrue('<iframe ' in res)
Beispiel #57
0
    def save(self, *args, **kwargs):
        if self.description:
            document = Document(self.description)
            self.readable_description = document.summary(html_partial=True)

        return super(FeedItem, self).save(*args, **kwargs)
Beispiel #58
0
def get_article_body(url):
    page = requests.get(url, timeout=(3.05, 10))
    doc = Document(page.text)
    soup = BeautifulSoup(doc.summary(), 'html.parser')
    return soup.get_text()
Beispiel #59
0
import xml.etree.ElementTree
from readability import Document
from langdetect import detect

from models import Visit
from database import db_session

LANG = {
    'en': 'english',
    'fr': 'french'
}


def remove_tags(text):
    return ''.join(xml.etree.ElementTree.fromstring(text).itertext())


for v in Visit.query.all():
    doc = Document(v.raw_dom)
    v.extacted_text = remove_tags(doc.summary())
    v.lang = LANG.get(detect(v.extacted_text), 'simple')

db_session.commit()