Example #1
def translate_page(request: HttpRequest):
    """
    翻译页面,可以使用url或file_id参数
    :param request:
    :return:
    """
    url = request.GET.get('url')
    file_id = request.GET.get('file_id')
    if url:  # a url was supplied
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        try:
            # fetch the page
            f = urllib.request.urlopen(url)
            text = f.read().decode()
            article = Document(text)
            # extract the readable HTML
            html = article.get_clean_html()
            # hash the extracted HTML so identical articles are stored only once per user
            m = hashlib.md5()
            m.update(html.encode('utf-8'))
            hashed = m.hexdigest()
            try:  # if this user already has the same text, read and return the stored copy
                p = UserPreference.objects.get(hashed=hashed,
                                               user=request.user)
            except UserPreference.DoesNotExist:
                p = UserPreference(user=request.user,
                                   original=text,
                                   trans=text,
                                   hashed=hashed)
                p.save()
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request,
                          'translate.html',
                          context={
                              'text': text,
                              'hashed': hashed,
                              'inc': p.inc
                          })
        except Exception:
            return redirect('/translate_index?message=' + _("Cannot open url"))
    elif file_id:
        try:
            p = UserPreference.objects.get(pk=file_id)
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request,
                          'translate.html',
                          context={
                              'text': text,
                              'hashed': p.hashed,
                              'inc': p.inc
                          })
        except UserPreference.DoesNotExist:
            pass
    return redirect('/')
Example #2
def sitegetter(bots, update, args):
    url = args[0]
    raw = args[1] == "true"
    # 'sendFile' was undefined in the original snippet; assume it comes from an
    # optional third argument that asks for the raw HTML as a file attachment.
    sendFile = len(args) > 2 and args[2] == "true"

    response = requests.get(url)

    if not sendFile:
        doc = Document(response.text)
        if raw is False:
            print('summary')
            text = doc.summary()
        else:
            text = doc.content()

        # split the text into 4000-character chunks so each piece can be sent as a separate message
        line = text
        n = 4000
        output = [line[i:i + n] for i in range(0, len(line), n)]

        for a in output:
            bots.send_message(chat_id=update.message.chat_id, text=a)
    else:
        with open('output.html', 'w+') as out:
            out.write(response.text)
            bots.send_document(chat_id=update.message.chat_id,
                               document=open('output.html', 'rb'))
Example #3
    def process_item(self, item, spider):
        '''
        DESCRIPTION:
        ------------
        For each news item, corresponding news text is extracted
        using python library 'readability'.

        RETURNS:
        --------
        News item with 'newsText' field updated is returned.
        '''
        try:
            response = requests.get(item['newsUrl'])
            doc = Document(response.text)
            content = Document(doc.content()).summary()
            h = html2text.HTML2Text()
            h.ignore_links = True
            articleText = h.handle(content)
            articleText = articleText.replace('\r', ' ').replace('\n',
                                                                 ' ').strip()
            item['newsText'] = articleText
        except Exception:
            raise DropItem("Failed to extract article text from: " +
                           item['newsUrl'])

        return item
Example #4
def main(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    document = Document(article.html)
    summary = document.summary()
    content = document.content()

    title = get_title(article)
    text = get_text(article)
    entities = get_entities(text)
    phrases = get_phrases(text, entities)
    keywords = get_keywords(article, phrases)
    urls_primary = get_urls(summary, url, [])
    urls_secondary = get_urls(content, url, urls_primary)

    return {
        'title': title,
        'text': text,
        'entities': entities,
        'keywords': keywords,
        'phrases': phrases,
        'urls': {
            'primary': urls_primary,
            'secondary': urls_secondary,
        },
    }
Example #5
    def parse_item(self, response):
        filename = hashlib.sha1(response.url.encode()).hexdigest()
        readability_document = Document(response.body, url=response.url)
        item = BeerReviewPage()
        item['url'] = response.url
        item['filename'] = filename
        item['depth'] = response.meta['depth']
        item['link_text'] = response.meta['link_text']
        item['title'] = readability_document.short_title()
        with open('data/' + filename + '.html', 'wb') as html_file:
            html_file.write(readability_document.content())
        print '(' + filename + ') ' + item['title'] + " : " + item['url']
        return item
Example #6
    def process(self, item, spider):
        try:
            response = requests.get(item['newsurl'])
            doc = Document(response.text)
            content = Document(doc.content()).summary()
            h = html2text.HTML2Text()
            h.ignore_links = True
            articltext = h.handle(content)
            articltext = articltext.replace('\r', ' ').replace('\n', ' ').strip()
            item['newstext'] = articltext
        except Exception:
            raise DropItem("Failed to extract article text from: " + item['newsurl'])
        return item
Example #7
def extract_article_info(text):
    """
    Gets simplified page from the text
    Uses readability module
    """
    doc = Document(text)
    # fetch the title (fall back to the full title if the short title is empty)
    title = doc.short_title()
    if not title:
        title = doc.title()
    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())
    # return
    return {'title': title, 'content': content, 'image': image}
Example #8
    def get_article_text(self, response):
        '''
        DESCRIPTION:
        ------------
        Cleanses the page of superfluous content such as advertising and HTML boilerplate.

        PARAMETERS:
        -----------
            1. response
        '''
        doc = Document(response.text)
        article_html = Document(doc.content()).summary()
        h = html2text.HTML2Text()
        h.ignore_links = True
        article_text = h.handle(article_html)
        article_text = article_text.replace('\r', ' ').replace('\n', ' ').strip()
        return article_text
Example #9
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive+'/*.html'):
        fname = os.path.basename(html)+'.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
            with open(savepath, 'w') as saving:
                json.dump(data, saving)
Example #11
    def cleanDocument(self, text, theUrl):
        replaceChars = [
            ("“", '"'),
            ("”", '"'),
            ("‘", "'"),
            ("’", "'"),
            ("`", "'"),
            ("`", "'"),
            ("′", "'"),
            ("—", "-"),
            ("–", "-"),
            ("…", "..."),
            ("•", "."),
            ("«", '"'),
            ("»", '"'),
            ("„", '"'),
            ("μ", "micro"),
            ("™", "(TM)"),
            ("≤", "<="),
            ("≥", ">="),
            ("∀", "ForAll"),
            ("⇒", "=>"),
            ("б", "(6)"),
            ("š", "s"),
            ("├", "|-"),
            ("─", "--"),
            ("|", "| "),
            ("│", "| "),
            ("└", "-"),
            ("→", "->"),
            ("⁄", "/"),
            ("⅓", "1/3"),
            ("📸", "(camera)"),
            ("✅", "(x)"),
            ("👽", "(alien)"),
            ("👍", "(ok)"),
            ("🙀", "(oh)"),
            ("🚀", "(despegar)"),
            ("\\n",""),
            ("\\t",""),
        ]

        from readability import Document

        doc = Document(text)
        doc_title = doc.title()

        if not doc_title or (doc_title == "[no-title]"):
            if theUrl.lower().endswith("pdf"):
                # the original referenced an undefined 'response' here; assume the
                # PDF title is extracted from the raw text that was passed in
                title = getPdfTitle(text)
                print(title)
                doc_title = "[PDF] " + title

        theTitle = doc_title

        # myText = doc.summary()
        myText = doc.content()

        for a, b in replaceChars:
            myText = myText.replace(a, b)
            theTitle = theTitle.replace(a, b)

        return (myText, theTitle)
Example #12
def make_site_with_rssfeed_readable_again(url, filename, is_clean):
    """Convert feed to an HTML."""
    with open(filename, 'w') as file_object:
        print "\nOPENING URL: " + url + "\n\n"
        headers = {
            'User-Agent':
            APP_BRANDNAME + '/' + APP_RELEASE +
            ' (Unix; Intel OS Nine 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
        }

        response = requests.get(url, headers=headers)
        mystr = response.text

        # remove height and width attributes on images because CSS will handle sizing
        mystr = mystr.replace(u"height=", "whatever=")
        mystr = mystr.replace(u"width=", "whatever=")

        # remove unwanted strings in output
        mystr = mystr.replace(u'<hr id=', '<hr class="spenden" id=')
        mystr = mystr.replace(u"<p><strong>Hilf mit!</strong>", "")
        mystr = mystr.replace(
            u"Mit Deiner finanziellen Hilfe unterstützt Du unabhängigen Journalismus.",
            "")

        if APP_DEBUG:
            print "FEED:\n" + str(mystr) + "\n****************************"

        feedtitle = None
        entries = []  # ensure 'entries' exists even if feed parsing fails below
        try:
            root = parse_feed(mystr)
            entries = root.entries

            # access feedtitle
            feedtitle = root.feed.title
        except Exception, e:
            print "PARSING-ERROR: " + str(e)
            print(traceback.format_exc())
            pass

        if not feedtitle:
            feedtitle = DEFAULT_TITLE

        if is_clean:
            template = APP_PATH + '/' + 'template_clean.html'
        else:
            template = APP_PATH + '/' + 'template_readable.html'

        if APP_DEBUG:
            print "\n ENTRIES TO RENDER: " + str(len(entries)) + "\n"

        last_entry_link = entries[-1].link if entries else None
        html_footer = site_footer_html()
        html_content = Template(filename=template,
                                output_encoding='utf-8').render(
                                    last_entry_link=last_entry_link,
                                    num_of_entries=len(entries),
                                    feedurl=url,
                                    entries=entries,
                                    feedtitle=feedtitle,
                                    footer=html_footer)

        if APP_DEBUG:
            print "HTML:\n" + html_content + "\n****************************"

        if is_clean:
            clean = Document(html_content)
            file_object.write(clean.content())

        else:
            file_object.write(html_content)
Example #13
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
"""

# Assumed imports for this snippet (not shown in the original excerpt):
import requests
from boilerpipe.extract import Extractor
from goose import Goose
from readability import Document

url = 'https://techcrunch.com/2017/02/13/mit-speech-chip/'  # BadStatusLine from boilerpipe

url = "http://www.forbes.com/sites/trevorclawson/2017/02/23/finding-a-voice-can-a-uk-startup-compete-with-its-heavy-hitters-in-the-speech-recognition-market/"

url = "https://nakedsecurity.sophos.com/2017/03/03/researcher-uses-googles-speech-tools-to-skewer-google-recaptcha/"

url = "http://www.natureworldnews.com/articles/32595/20161123/microsoft-officially-makes-first-humanly-accurate-speech-recognition-tech.htm"

url = "http://www.businessinsider.com/ibm-edges-closer-to-human-speech-recognition-2017-3"
#ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
#print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor',
                                      url=url)
print ArticleSentencesExtractor.getText()
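
# The docstring above lists several other boilerpipe extractor variants; as a
# rough sketch (assuming the same boilerpipe.extract wrapper API), any of them
# can be selected by name in the same way, e.g.:
KeepEverything = Extractor(extractor='KeepEverythingExtractor', url=url)
print(KeepEverything.getText())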

article = Goose().extract(url=url)
print article.cleaned_text

document = Document(requests.get(url).text)  # pass the response body, not the Response object
document.content()
Example #14
import argparse
import requests
from readability import Document

parser = argparse.ArgumentParser()
parser.add_argument('-u',
                    '--url',
                    dest='url',
                    help='url of policy',
                    metavar='URL',
                    required=True)
parser.add_argument('-d',
                    '--dest',
                    dest='filepath',
                    help='file to save policy',
                    metavar='FILE',
                    required=True)

args = parser.parse_args()
url = args.url
filepath = args.filepath
print(url, filepath)

response = requests.get(url)
doc = Document(response.text)
doc.title()
with open(filepath, 'w') as fd:
    fd.write(doc.content())
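
# Example invocation (the script filename 'save_policy.py' is hypothetical):
#   python save_policy.py --url https://example.com/privacy --dest policy.html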
Example #15
import requests
from readability import Document
from pprint import pprint

response = requests.get('https://laravel-news.com/announcing-building-a-chatbot-with-laravel-and-botman')

doc = Document(response.text)
# API methods:
# .title() -- full title
# .short_title() -- cleaned up title
# .content() -- full content
# .summary() -- cleaned up content
data = dict()
data['title'] = doc.title()
data['short_title'] = doc.short_title()
data['content'] = doc.content()
data['summary'] = doc.summary()


pprint(data)
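
# The .summary() call above returns a complete HTML document.  As shown in the
# extract_article_info example earlier, summary() also accepts html_partial=True
# to return just the inner article markup; a small sketch of the same call:
data['summary_partial'] = doc.summary(html_partial=True)
pprint(data['summary_partial'])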