Example #1
    def convert(self):
        if self.url:
            response = requests.get(self.url, headers=self.headers)
            content = response.content

        elif self.path:
            with open(self.path, 'rb') as f:
                content = f.read()

        soup = BeautifulSoup(content, 'html.parser')
        try:
            self.encoding = get_encoding(soup)
        except ValueError:
            self.encoding = soup.original_encoding

        doc = Document(content.decode(self.encoding, "ignore"))

        self.title = doc.title() if len(doc.title()) > 0 else "Awesome article"

        self.soup = BeautifulSoup(doc.summary(), 'html.parser')

        self.process_images()
        self.add_head()
        if len(self.soup.find_all("h1")) == 0:
            self.insert_title()
        if self.url:
            self.insert_link()
        self.save_html()

        self.convert_to_mobi()
        if self.send_by_mail:
            self.send_to_kindle()
        if self.clean:
            self.do_cleaning()
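Example #1 relies on a get_encoding helper that is not shown. A minimal sketch of what such a helper might look like, assuming it only inspects the document's <meta> tags and raises ValueError when no charset is declared (the name and behaviour are guesses, not the original project's code):

def get_encoding(soup):
    # Hypothetical helper: read the declared charset from <meta> tags.
    meta = soup.find('meta', charset=True)
    if meta:
        return meta['charset']
    meta = soup.find('meta', attrs={'http-equiv': lambda v: v and v.lower() == 'content-type'})
    if meta and 'charset=' in meta.get('content', '').lower():
        return meta['content'].lower().split('charset=')[-1].strip()
    # Matches the caller above, which falls back to soup.original_encoding on ValueError.
    raise ValueError('no charset declared in the document')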
Example #2
def get_url(url_var):
    response = requests.get(url_var)
    tree = html.fromstring(response.content)
    doc = Document(response.text)
    # print(doc.title())
    tree_text = tree.xpath(
        '//p/text() | //p/a/text() |//p/b/text() | //div/text() | //h2/text() | //h1/text() | //h2/a/text() | //h3/text() | //h3/a/text()'
    )
    # tree_text = tree.xpath('//div/text()')
    # print(tree_text)
    # response=request.urlopen("https://en.wikipedia.org/wiki/Louis_Tomlinson")
    # print("URL:",response.geturl())

    # print("2:")
    text = ""
    file = open('text_with_b.txt', 'w+')
    for x in tree_text:
        if "'b'" in x:
            x = x.strip("'b'")  # strip() returns a new string; reassign it
        file.write(str(x.encode("utf-8")))
    file.close()

    # print("3:")
    s0 = '"b"'
    s1 = "'b'"
    s2 = "'b"
    s3 = "b'"
    # print(set(string.printable))
    # pattern = "\"(?=[<\"]+,)[>\"]+\""
    with open('text_with_b.txt', 'r') as infile, \
         open('text_document.txt', 'w') as outfile:
        data = infile.read()
        data = data.replace(s0, "")
        data = data.replace(s1, "")
        data = data.replace(s2, "")
        data = data.replace(s3, "")
        outfile.write(data)

    print(doc.title())
    print()
    fs = FrequencySummarizer()
    f = open('result.txt', 'w+')
    f.write(str(doc.title() + '\n'))
    with open('text_document.txt', "r") as file:
        text = file.read()
        for s in fs.summarize(text, 4):
            f.write(str('*' + s))
            print()
    f.close()
    # starting summarizer
    # print("4:")
Example #3
 def parse_text2(self, url, doc):
     '''Parse text data from the page: body text, title, publish time, author, etc.'''
     article = Document(doc)
     try:
         text = article.summary()
         # If no title can be found on the current page, fall back to the
         # title recorded for this link in self.links_all_dict.
         title = article.title() if article.title() else self.links_all_dict.get(url, '')
     except Exception as e:
         logger.warning('Could not find title and text for page {}: {}'.format(url, e))
         title, text = '', ''
     return title, text
Example #4
    def parse_item(self, response):

        if response.status == 200:
            # Extracting the content using css selectors
            urls = response.css('.media__link::attr(href)').extract()
            tag_texts = response.css('.media__tag::text').extract()
            tag_urls = response.css('.media__tag::attr(href)').extract()

            urls_cleansed = self.reconcile_url_base(urls)
            tag_urls_cleansed = self.reconcile_url_base(tag_urls)

            if len(urls_cleansed) != len(tag_urls_cleansed):
                raise Exception(
                    'Length Mismatch between article urls and tag urls')

            article_info_list = []

            for item in zip(urls_cleansed, tag_urls_cleansed, tag_texts):
                url = item[0]
                tag_url = item[1]
                tag_text = item[2]
                url_response = requests.get(url)
                doc = Document(url_response.text)
                soup = BeautifulSoup(url_response.text, 'html.parser')
                date_info = soup.find('div', attrs={'class': 'date date--v2'})
                if date_info:
                    created_time_epoch = int(date_info['data-seconds'])
                    created_time_datetime = datetime.fromtimestamp(
                        created_time_epoch)
                else:
                    created_time_datetime = None

                title = doc.title()
                cleansed_body = doc.summary()
                # Created time can't be scraped with Scrapy or with readability, so use BeautifulSoup for that.
                body_soup = BeautifulSoup(cleansed_body, 'html.parser')
                cleansed_article_text = ' '.join([
                    x.get_text().replace('\n', ' ')
                    for x in body_soup.find_all('p')
                ])
                cleansed_article_text = self.clean_article_text(
                    cleansed_article_text)

                itemm = BbcArticlesItem()
                itemm['title'] = doc.title()
                itemm['url'] = url
                #itemm['time']=created_time_datetime
                #itemm['type']=tag_url
                itemm['related_topics'] = tag_text
                itemm['article_text'] = cleansed_article_text

                yield itemm
Example #5
 def get(self):
     url = self.get_argument("url", None)
     # https://www.ifanr.com/1080409
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if doc:
         self.res = dict(doc)
         return self.write_json()
     try:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         doc = Document(response.text)
         title = doc.title()
         summary = doc.summary()
         markdown = html2text.html2text(summary)
         markdown = markdown.replace('-\n', '-')
         markdown = markdown.strip()
         res = {}
         res['url'] = url
         res['title'] = title
         res['markdown'] = markdown
         if title and markdown:
             webcache = Webcache
             webcache.new(res)
             self.res = res
         self.write_json()
     except Exception as e:
         print(e)
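Examples #5 and #36 call a get_charset helper that is not included in the snippets. A plausible sketch, assuming it simply prefers the server-declared charset and otherwise falls back to the encoding requests detects from the body (an illustration, not the original implementation):

def get_charset(response):
    # Hypothetical helper: pick a sensible text encoding for the response.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' in content_type.lower():
        return response.encoding  # requests already parsed the declared charset
    return response.apparent_encoding  # detected from the body (chardet/charset_normalizer)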
Example #6
    def getAutoDetail(cls,
                      contentPageNumber,
                      html,
                      enableDownloadImage=False,
                      enableSnapshot=False):
        autoDetail = {}
        try:

            doc = Document(html)
            # response.
            if contentPageNumber <= 1:
                autoDetail["title"] = ArticleUtils.cleanHeadTitle(doc.title())
                # autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
                # autoDetail["html"] = html
            contentSnapshot = doc.summary()
            if StringUtils.isNotEmpty(
                    ArticleUtils.removeAllTag(contentSnapshot)):
                if enableSnapshot:
                    autoDetail["contentSnapshot"] = contentSnapshot.replace(
                        "<html>",
                        "").replace("</html>",
                                    "").replace("<body>",
                                                "").replace("</body>", "")
                autoDetail["content"] = ArticleUtils.removeTag4Content(
                    contentSnapshot)
                if enableDownloadImage:
                    autoDetail[
                        "contentImages"] = ArticleUtils.get_content_image_urls(
                            contentSnapshot, response.url)
        except Exception as e:
            return autoDetail
        return autoDetail
Example #8
 def getAutoDetail(cls,
                   response,
                   enableDownloadImage=False,
                   enableSnapshot=False,
                   isFirstPage=True):
     autoDetail = {}
     try:
         html = "".join(response.xpath("//html").extract())
         doc = Document(html)
         if isFirstPage:
             autoDetail["title"] = doc.title()
             autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
         contentSnapshot = doc.summary()
         if enableSnapshot:
             autoDetail["contentSnapshot"] = contentSnapshot.replace(
                 "<html>",
                 "").replace("</html>",
                             "").replace("<body>",
                                         "").replace("</body>", "")
         autoDetail["content"] = ArticleUtils.removeTag4Content(
             contentSnapshot)
         if enableDownloadImage:
             autoDetail[
                 "contentImages"] = ArticleUtils.get_content_image_urls(
                     contentSnapshot, response.url)
     except Exception as e:
         return autoDetail
     return autoDetail
Example #9
def get_text(url):
    response = get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    body = bs4.BeautifulSoup(summary, features="lxml").get_text()
    return f"{title} : {body}"
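Example #9 is the most compact of these snippets; the imports it assumes (not shown in the original) would look roughly like this:

import bs4
from requests import get
from readability import Document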
Example #10
def extract():
	
	url = request.form['site']

	response = requests.get(url)
	doc = Document(response.text)

	parser = html2text.HTML2Text()
	parser.ignore_links = True
	parser.ignore_images = True
	parser.ignore_emphasis = True
	parser.ignore_anchors = True
	parser.ignore_tables = True

	title = doc.title()
	title = re.sub(r' *- [-a-zA-Z0-9 @:%._\+~#=]{1,256}', '', title)														#	<---- this crops everything after -[space]
	title = re.sub(r' *- *[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', title)	#	<---- this is for websites only
	article = parser.handle(str(doc.summary()))
	article = fix_article(article)
	article = article.split("<br/>")

	if url != '':
		insert_in_db(analytics_client, container, url, title, article)
		return flask.render_template('index.html', title=title, data=article)
	else:
		error_msg = "Enter a valid URL"
		return flask.render_template('index.html', error=error_msg)
Example #11
def parse(html):
    doc = Document(html)
    title = doc.title()
    if title == u'[no-title]':
        title = u''
    content_html = doc.summary()
    content_html = content_html.replace(u'<html>', u'').replace(u'</html>', u'')\
        .replace(u'<body>', u'').replace(u'</body>', u'')

    clear_paths = [u'//script', u'//img', u'//a']
    body = clearDOM(html, clear_paths)

    match_list = findTimeStr(body)
    post_date = u''
    for match_item in match_list:
        if len(match_item) > len(post_date):
            post_date = match_item

    style_in_list = []
    style_need_replace = []

    content_item = {
        u'title': title,
        u'content_html': content_html,
        u'post_date': post_date,
        u'style_in_list': style_in_list,
        u'style_need_replace': style_need_replace
    }
    return content_item
Example #12
def get_article(url):
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        response = requests.get(
            url, headers=headers, timeout=4
        )  # can throw BaseException if the server does not respond, or TooManyRedirects
    except BaseException:
        return ''

    readability_doc = Document(response.text)

    # doc.summary() is not really a summary, just the main part of the
    # website's content
    html_content = readability_doc.summary()
    title = readability_doc.title()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Replace all excessive whitespace and newlines
    content = re.sub(r"(\s{2,})|(\n{2,})",
                     "\n",
                     soup.get_text(),
                     flags=re.UNICODE)
    if " - " in title:
        title = re.sub(r" - [\s\w]+$", "", title)

    return content, title
Example #13
def analyze(request):
    'API text analyze view'
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # catch POST form as well
            for key in request.POST.dict().keys():
                text = key

        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            page = requests.get(text)
            doc = Document(page.text)
            soup = BeautifulSoup(doc.summary(), 'html.parser')
            text = soup.get_text()
            title = doc.title().strip()
            text = '{0}.\n{1}'.format(title, text)

        if not text:
            response = JsonResponse(
                {'status': 'false', 'message': 'need some text here!'})
            response.status_code = 400
            return response

        # add some limit here
        text = text[:200000]
        ret = {}
        ret = analyze_text(text)
        return JsonResponse(ret)
    else:
        ret = {'methods_allowed': 'POST'}
        return JsonResponse(ret)
Example #14
def feedtheURLs(url, fileName):
    # f**k you sina
    if ("sina" in str(url)):
        content = urlopen(url).read().decode('utf-8', 'ignore')  # decode bytes so the string slicing below works in Python 3
        str_start = '<!--博文正文 begin -->'
        str_end = '<!-- 正文结束 -->'
        start = content.find(str_start)
        end = content.find(str_end)
        con1 = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n'
        con2 = content[start:end]
        body = con1 + con2
        doc = Document(body)
    else:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

        except Exception:
            return
        doc = Document(response.text)

    try:
        # f**k sina
        if ("sina" in str(url)):
            title = re.findall(r"<h2.*?/h2>", body)[0]
        else:
            title = doc.title()
        # content = doc.content()
        summary = doc.summary()
        title = strip_tags(title)
        # content = strip_tags(content)
        summary = strip_tags(summary)
        # print (title)
        # sys.exit()
    except Exception:
        return

    with open("Crawler_Output/{}.txt".format(fileName), "a") as my_file:
        if re.search(u'[\u4e00-\u9fff]', summary):
            my_file.write("标题:" + title)
            my_file.write("\n链接:" + url)
            my_file.write("文章内容:\n" + summary)
        else:
            my_file.write("标题:" + "该链接是无效链接")
            my_file.write("\n链接:" + url)
            my_file.write("文章内容:\n" + "该文章已经被删除或网络请求错误")

    clean_lines = []
    with open("Crawler_Output/{}.txt".format(fileName), "r") as f:
        lines = f.readlines()
        clean_lines = [l.strip() for l in lines if l.strip()]

    with open("Crawler_Output/{}.txt".format(fileName), "w") as f:
        # 		f.write('''
        # ========================================================
        # 			''')
        f.writelines('\n'.join(clean_lines))
        f.write('''
========================================================
			''')
Example #15
def extract_core_html(html: str):
    """从文章类型提取核心HTML

    Args:
        html (str): raw html
    """
    doc = Document(html)
    return doc.title(), doc.summary()
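A possible way to call extract_core_html, assuming the HTML is fetched with requests (the URL below is only a placeholder):

import requests

html = requests.get('https://example.com/article').text  # placeholder URL
title, core_html = extract_core_html(html)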
Example #16
 def process_one(self, content):
     try:
         doc = Document(content)
         return ContentResult(doc.title(), doc.summary())
     except Exception as e:
         logger.error(
             f"Readability failed on {content.title} with error {e}")
         return ContentResult('', '')
Example #17
def extract(html):
    """
    Simply uses regex and readability to get document body from html
    """

    doc = Document(html)
    json_ret = {
        "title": tag_re.sub('', doc.title()),
        "body": tag_re.sub('', doc.summary()).replace("\n", " ").replace("\xa0", " "),
    }
    return json_ret
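tag_re is referenced in Example #17 but never defined in the snippet. One plausible definition, offered only as an assumption, is a regex that strips anything shaped like an HTML tag:

import re

tag_re = re.compile(r'<[^>]+>')  # assumed: removes <...> tags left in the title/summary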
Example #18
    def transform(self, row, chan):
        row['response'] = resolve_future(row['response'])

        doc = Document(row['response'].content)

        row['title'] = doc.title()
        summary = doc.summary()
        row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()

        yield row
Example #19
def extract_title_and_summary(content):
    doc = Document(content)
    title = doc.title()
    try:
        lang = detect(title)
    except LangDetectException:
        lang = 'unknown'
    s = TAG_RE.sub('', doc.summary())
    s = ' '.join([x for x in s.split() if x.strip() != ''])
    return title, lang, s
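Example #19 likewise depends on names defined elsewhere in its project. The imports and TAG_RE it appears to assume could be sketched as follows (the TAG_RE pattern is a guess):

import re

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from readability import Document

TAG_RE = re.compile(r'<[^>]+>')  # assumed: strips HTML tags from the summary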
Example #20
def get_manchete(k):
    soup = BeautifulSoup(k, 'lxml')
    #manchete = soup.findAll('h1',{'class':'content-head__title'})
    manchete = soup.findAll('h1',{'property':'na:headline'})
    try:    
        manchete_ok = manchete[0].text
    except IndexError:   
        page_content = Document(k)
        manchete_ok = page_content.title()
    return(manchete_ok)
Example #21
 def parse(self, file_path):
     with open(file_path) as fin:
         content = fin.read()
     doc = Document(content)
     title = doc.title()
     article = doc.summary()
     readable_article = self.strip(article)
     readable_title = self.strip(title)
     #print readable_article
     #print readable_title
     return readable_title + " " + readable_article
Example #22
def extract_article(url, ip):
    """Extracts the article using readability"""
    title, summary = None, None
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        # doc.summary()/doc.title() already return text; unicode() was Python 2-only
        summary = doc.summary()
        title = doc.title()
        return title, summary
    else:
        return None
Example #23
    def parallelizable_request(self, entry):
        req = requests.get(entry["link"])
        if not req.ok:
            print(f"Honk! Couldn't grab content for {self.feed_url}")
            return None

        doc = Document(req.content)
        source = entry["link"].split(".")[1]
        story = Story(doc.title(), body_html=doc.summary(), byline=source)

        return story
Example #24
def compare(request):
    'API compare documents view'
    doc_dicts = []
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # catch POST form as well
            for key in request.POST.dict().keys():
                text = key

        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            lines = text_to_list(text)
            i = 0
            for line in lines[:2]:
                if not line.startswith(('http://', 'https://', 'www')):
                    response = JsonResponse({'status': 'false', 'message': 'need at least 2 urls!'})
                    response.status_code = 400
                    return response
                page = requests.get(line)
                doc = Document(page.text)
                soup = BeautifulSoup(doc.summary(), 'html.parser')
                text = soup.get_text()
                title = doc.title().strip()
                text = '{0}.\n{1}'.format(title, text)
                if not text:
                    response = JsonResponse({'status': 'false', 'message': 'need some text here!'})
                    response.status_code = 400
                    return response
    
                # add some limit here
                text = text[:200000]
                doc = text_to_doc(text)
                language = doc.lang_
                if i > 0 and language != doc_dicts[0]['language']:
                    response = JsonResponse(
                        {'status': 'false', 'message': 'texts must be in same language!'})
                    response.status_code = 400
                    return response
                    
                doc_dicts.append({'language': language, 'doc': doc})
                i += 1
            ret = compare_docs(doc_dicts)
            ret['language'] = language
            ret['text'] = text
            return JsonResponse(ret)
        else:
            response = JsonResponse({'status': 'false', 'message': 'need 2 documents!'})
            response.status_code = 400
            return response

    else:
        return JsonResponse({'methods_allowed': 'POST'})
Example #25
def get_manchete(k):
    #soup = BeautifulSoup(k, 'lxml')
    soup = k
    #manchete = soup.findAll('h1',{'class':'content-head__title'})
    manchete = soup.findAll('h1',{'class':'articulo-titulo'})
    try:      
        manchete_ok = manchete[0].text
    except IndexError:   
        page_content = Document(k)
        manchete_ok = page_content.title()
    return(manchete_ok)
Example #26
def extract_article(html_content, language="zh", holding_url="http://127.0.0.1/"):
    doc = Document(html_content)
    title = doc.title()
    html = doc.summary(True)

    # return title, html

    article = Article(url=holding_url, language=language)
    article.download(input_html=html)
    article.parse()

    return title, article.text
Example #27
def get_article_content(html_content, title=None):
    doc = Document(html_content)
    # 'html_content' is used for display
    # and existing classes are removed
    html_content = re.sub('class=".*?"', '', doc.summary(html_partial=False))
    # 'content' is used for search
    content = BeautifulSoup(html_content, "html.parser").text
    return {
        'title': title if title else doc.title(),
        'content': content,
        'html_content': html_content,
    }
Example #28
    def populate_from_html(self, html):

        doc = Document(html)

        title = doc.title()
        body = doc.summary(html_partial=True)
        body_plain_text = Item.strip_tags(body)

        self.source_response_raw = html
        self.title = title
        self.body = body
        self.body_plain_text = body_plain_text
Example #29
File: view.py Project: edd07/resh
def view_html(url):
    """Converts an html document to a markdown'd string
    using my own fork of python-readability"""
    try:
        from readability import Document
    except ImportError:
        print("Can't convert document: python-readability is not installed")
        return
    
    html = urlopen(url).read()
    doc = Document(html)
    print(wrap(asciify(BOLD + doc.title() + RESET + "\n" + doc.markdown(), strip_newlines=False), 80, ''))
Example #30
def abc(url):
    #url = "https://github.com/codelucas/newspaper"
    r = requests.get(url)
    r.encoding = "utf-8"
    #html = r.text
    doc = Document(r.text)
    print(doc.title())
    print(doc.summary())
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.bypass_tables = False
    text = text_maker.handle(doc.summary())
    print(text)
Example #31
def saveHTML(url, img=True, prefix=''):
    if img:
        text = generate(url)
    else:
        response = requests.get(url)
        text = response.text
    doc = Document(text)
    content = fixHeader(doc.summary())

    filename = os.path.join(prefix, doc.title() + '.html')
    with open(filename, 'w') as f:
        f.write(content)
    return filename
Example #32
def translate(link):
    global url
    global text_nodes
    global text_strings
    global markup

    dest = "en"
    url = link
    parser = HtmlParser()
    response = requests.get(url)
    doc = Document(response.text)
    # tree = fromstring(r.content)
    title = doc.title()  # tree.findtext('.//title')
    lang = translator.detect(title).lang
    if lang == 'en':
        # print("The article appears to be in English already.")
        return 'null'
    title = translator.translate(title).text
    content = doc.summary()
    # print(content)
    soup = bs(content, 'lxml')
    text = str(soup.find('body'))
    # text = r.text.split('<body')[1].split('</body>')[0]
    repls = ('h1>', 'h3>'), ('h2>', 'h3>'), ('<h1', '<h3'), ('<h2', '<h3')
    text = reduce(lambda a, kv: a.replace(*kv), repls, text)
    text = emoji.get_emoji_regexp().sub(r'', text)  # removing the emojis
    # print(text)
    parser.feed(text)
    # print("text_nodes: ", text_nodes)
    # print(text_strings)
    # print(text)
    # print(markup)
    # print("STARTING TO TRANSLATE...", url)
    translations = translator.translate(text_strings, dest=str(dest))
    final_payload = []
    for translation in translations:
        scheme = [translation.text]
        # print(translation.origin, ' -> ', scheme[0])
        final_payload.extend(scheme)
    markup = markup.format(*final_payload)
    markup = re.sub(r'\s([?.!"](?:\s|$))', r'\1', markup)
    print("\n")
    # print(markup)
    access_token = os.environ.get("access_token")
    t = TelegraphPoster(
        access_token=access_token)
    article = t.post(title=str(title), author='lulz', text=str(markup))
    x = str(article).replace("'", '"')
    article = json.loads(x)
    text = "Your article is ready to read! {}".format(article['url'])
    return text
Example #33
    def getContent(self, url, cate):
        #         print("Fetching "+url+" ...")
        try:
            page_source = urllib.request.urlopen(url)
            self.crawledList.append(url)
            html = page_source.read().decode("utf8")
            doc = Document(html)
            bsObj = BeautifulSoup(doc.summary(), "html.parser")

            title = str(doc.title())
            # content =
            # bsObj.find('div', attrs={'class':'story-body__inner'})
            #Process Text

            text = str(bsObj.text)
            text = text.replace("Image copyright", "")

            text = text.replace("  ", "")
            text = text.replace("\n\n", "")
            text = text.replace("- BBC News", "")
            #Save text
            #Category
            if cate == 'business':
                self.category.append('business')
                self.bu += 1
            if cate == 'entertainment':
                self.category.append('entertainment')
                self.en += 1
            if cate == 'politics':
                self.category.append('politics')
                self.po += 1
            if cate == 'sport':
                self.category.append('sport')
                self.sp += 1
            if cate == 'tech':
                self.category.append('tech')
                self.te += 1
            #Filename
            filename = url[url.rfind("/") + 1:]
            self.filename.append(filename)
            #Title
            self.title.append(title)
            #content
            self.content.append(text)
            self.getLinkList(url)
            self.saveToTxt(title + "\n\n" + text, cate)
            self.sucess += 1
            return 1
        except Exception as error:
            print(error)
Example #34
    def parse(self, response: scrapy.http.Response, **kwargs):
        """
        Page parser.
        :param response: Page and response object.
        :param kwargs: A dict of parameters.
        :return: None
        """

        content_type: bytes = response.headers.get('Content-Type')
        if not content_type or not content_type.startswith(b'text/html'):
            return None

        # Note: response.headers is a caseless dict.
        this_page = PageItem()
        article = Document(response.text, handle_failures=None)
        this_page['link_count'] = len(response.css('a[href]'))
        this_page['title'] = article.title()
        this_page['url'] = response.url
        this_page['content'] = article.summary()

        # Submit the this_page to pipeline.
        yield this_page

        # Get other links from the page and append them to url list
        link_list = get_links(response)
        for title, url in link_list:
            title = title.strip() if title else ''
            url = response.urljoin(url)
            if '.sit.edu.cn' not in url:
                continue
            """
            Separate pages from attachments.
            We may fetch the url and see what server say in 'Content-Type' but it can't be done in parse
            function. Actually, it's the simplest way to distinguish pages and attachments without fetching. 
            """
            _, path = divide_url(url)
            link_type = guess_link_type(path)
            if link_type == 'page':
                # Fetch next page
                yield scrapy.Request(url=url, callback=self.parse)

            elif link_type == 'attachment':  # link_type may equal to 'attachment'
                item = AttachmentItem()

                item['referer'] = response.url  # Url of current web page
                item['url'] = url  # Url of attachment
                item['title'] = title.replace('\xa0', '').replace(
                    ' ', '')  # Take file title from last page.
                yield item
Example #35
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive+'/*.html'):
        fname = os.path.basename(html)+'.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
            with open(savepath, 'w') as saving:
                json.dump(data, saving)
Example #36
 def get(self):
     sharetype = self.get_argument("sharetype", "goodlink")
     link = self.get_argument("link", '')
     user_id = self.current_user["user_id"]
     assert link
     url = link
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if not doc:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         logger.info('response.encoding {}'.format(response.encoding))
         doc = Document(response.text)
         doc_title = doc.title()
         summary = doc.summary()
         _markdown = html2text.html2text(summary)
         _markdown = _markdown.replace('-\n', '-').strip()
         res_webcache = {}
         res_webcache['url'] = url
         res_webcache['title'] = doc_title
         res_webcache['markdown'] = _markdown
         if _markdown:
             webcache = Webcache
             webcache.new(res_webcache)
     else:
         logger.info('already')
         doc_title = doc.title
     res = {
         'title': doc_title,
         'sharetype': sharetype,
         'link': link,
     }
     share = Share
     res['user_id'] = user_id
     share = share.new(res)
     user = User.by_sid(user_id)
     user.user_leaf += 10
     user.save()
     self.redirect("/share/" + str(share.id))