def api_index():
    secret_item = Secret.query.order_by(db.desc(Secret.id)).first()
    article_item = Article.query.order_by(db.desc(Article.id)).first()
    essay_item = Essays.query.order_by(db.desc(Essays.id)).first()
    ArticleItem = {
        'title': article_item.title,
        'id': article_item.id,
        'author': article_item.author.username,
        'authorID': article_item.author.id,
        'content': clean_html(article_item.content)[0:200],
        'like': len(article_item.like),
        'time': article_item.time,
    }
    EssayItem = {
        'author': essay_item.author.username,
        'authorID': essay_item.author.id,
        'content': essay_item.content,
        'like': len(essay_item.like),
        'time': essay_item.time,
        'id': essay_item.id
    }
    SecretItem = {
        'id': secret_item.id,
        'title': secret_item.title,
        'content': clean_html(secret_item.content)[0:200],
        'like': len(secret_item.like),
        'time': secret_item.time,
    }
    data = {
        'SecretItem': SecretItem,
        'ArticleItem': ArticleItem,
        'EssayItem': EssayItem
    }
    return jsonify(data)
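# A minimal defensive sketch, not part of the original app: Query.first()
# returns None on an empty table, so api_index above would raise
# AttributeError on a fresh database. `serialize_article` is a hypothetical
# helper name; the field set mirrors ArticleItem above.
def serialize_article(article_item):
    if article_item is None:
        return False
    return {
        'title': article_item.title,
        'id': article_item.id,
        'author': article_item.author.username,
        'authorID': article_item.author.id,
        'content': clean_html(article_item.content)[0:200],
        'like': len(article_item.like),
        'time': article_item.time,
    }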
def api_user_articles(user_id):
    temp_user = User.query.filter(User.id == user_id).first()
    articles = temp_user.articles
    params = request.values.to_dict()
    length = len(articles)
    article_list = []
    if length:
        if length > 10:
            # Default to the first page of ten when no range was requested.
            if not len(params):
                params = {'start': 1, 'end': 10}
            noMore = False
        else:
            params = {'start': 1, 'end': length}
            noMore = True
        params['start'] = int(params['start'])
        params['end'] = int(params['end'])
        if length >= params['start']:
            # Newest first; slicing clamps at the list length, so one loop
            # covers both the full-page and final-partial-page cases.
            for item in articles[::-1][params['start'] - 1:params['end']]:
                article_list.append({
                    'author': item.author.username,
                    'title': item.title,
                    'id': item.id,
                    'content': clean_html(item.content)[0:200],
                    'like': len(item.like),
                    'time': item.time,
                    'authorID': temp_user.id
                })
            if length <= params['end']:
                noMore = True
        else:
            noMore = True
    else:
        noMore = True
    data = {
        'ArticleList': article_list,
        'noMore': noMore,
        'username': temp_user.username
    }
    return jsonify(data)
def extract_page_text(self, searchTree):
    retrieved_ocr = None
    # If several divs match, only the last one's cleaned markup is kept.
    for a in searchTree.cssselect('div#modeTexte > div'):
        retrieved_ocr = tostring(a)
        retrieved_ocr = clean_html.clean_html(retrieved_ocr, newline_at_br=True)
    return retrieved_ocr
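# Hedged usage sketch for extract_page_text, assuming an lxml document tree
# (cssselect() additionally needs the `cssselect` package installed);
# `scraper` is a hypothetical instance of the class the method belongs to.
from lxml.html import fromstring

tree = fromstring('<div id="modeTexte"><div>line one<br>line two</div></div>')
print(scraper.extract_page_text(tree))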
def api_secrets():
    secrets = Secret.query.all()
    params = request.values.to_dict()
    length = len(secrets)
    secrets_list = []
    if length:
        if length > 10:
            if not len(params):
                params = {'start': 1, 'end': 10}
            noMore = False
        else:
            params = {'start': 1, 'end': length}
            noMore = True
        params['start'] = int(params['start'])
        params['end'] = int(params['end'])
        if length >= params['start']:
            # start/end are a 1-based, end-inclusive range, as in
            # api_user_articles; slicing clamps, so one loop covers both cases.
            for item in secrets[::-1][params['start'] - 1:params['end']]:
                secrets_list.append({
                    'title': item.title,
                    'id': item.id,
                    'content': clean_html(item.content)[0:200],
                    'like': len(item.like),
                    'time': item.time,
                })
            if length <= params['end']:
                noMore = True
        else:
            noMore = True
    else:
        noMore = True
    data = {
        'ArticleList': secrets_list,
        'noMore': noMore,
    }
    return jsonify(data)
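# api_user_articles and api_secrets repeat the same start/end pagination dance.
# A minimal sketch of a shared helper under the same request contract;
# `paginate_newest_first` is a hypothetical name, not part of the original code.
def paginate_newest_first(items, params):
    length = len(items)
    if not length:
        return [], True
    if length > 10:
        start = int(params.get('start', 1))
        end = int(params.get('end', 10))
        no_more = False
    else:
        start, end = 1, length
        no_more = True
    if length < start:
        return [], True
    page = items[::-1][start - 1:end]
    if length <= end:
        no_more = True
    return page, no_more

# e.g. page, no_more = paginate_newest_first(Secret.query.all(),
#                                            request.values.to_dict())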
def save(self, *args, **kwargs):
    if self.id is not None:
        self.title = clean_html(self.title)
        old_version = RSSFeed.objects.get(id=self.id)
        if (self.link != old_version.link) and (not validate_feed(self.link)):
            raise Exception("RSS Feed is not valid")
    super(RSSFeed, self).save(*args, **kwargs)  # Call the "real" save() method.
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSFeed_Sub Manager")
    if RSSFeed_Sub.objects.filter(user=kwargs['user'], feed=kwargs['feed']).exists():
        raise Exception("User (" + kwargs['user'].username +
                        ") is already subscribed to the RSSFeed (id = " +
                        str(kwargs['feed'].id) + ")")
    return super(RSSFeed_SubManager, self).create(*args, **kwargs)
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSFeed Manager")
    if 'link' in kwargs and isinstance(kwargs['link'], str):
        if RSSFeed.objects.filter(link=kwargs['link']).exists():
            raise Exception("RSSFeed already exists in the database.")
        elif not validate_feed(kwargs['link']):
            raise Exception("RSS Feed is not valid")
    else:
        raise Exception("Link is missing - RSSFeed Manager")
    return super(RSSFeedManager, self).create(*args, **kwargs)
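# Hedged usage sketch for the two managers above; `some_user` is a hypothetical
# User instance, and the bare Exceptions mirror what the create() overrides raise.
try:
    feed = RSSFeed.objects.create(title='<b>Example</b> feed',
                                  link='https://example.org/rss.xml')
    RSSFeed_Sub.objects.create(user=some_user, feed=feed, title=feed.title)
except Exception as err:
    print(err)  # e.g. "RSSFeed already exists in the database."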
def api_user(user_id):
    temp_user = User.query.filter(User.id == user_id).first()
    if temp_user.articles:
        last_article = temp_user.articles[-1]
        last_article_data = {
            'title': last_article.title,
            'author': temp_user.username,
            'content': clean_html(last_article.content)[0:200],
            'like': len(last_article.like),
            'time': last_article.time,
            'id': last_article.id,
            'authorID': temp_user.id
        }
    else:
        last_article_data = False
    if temp_user.essays:
        last_essay = temp_user.essays[-1]
        last_essay_data = {
            'author': temp_user.username,
            'content': last_essay.content,
            'like': len(last_essay.like),
            'time': last_essay.time,
            'authorID': temp_user.id,
            'id': last_essay.id
        }
    else:
        last_essay_data = False
    data = {
        'username': temp_user.username,
        # Fallback shown when the user never set a signature.
        'selfIntro': temp_user.self_introduction or
                     'This user came and went like the wind, leaving no signature~ ~ ~',
        'lastArticle': last_article_data,
        'lastEssay': last_essay_data
    }
    return jsonify(data)
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSArticle Manager")
    if 'link' in kwargs and isinstance(kwargs['link'], str):
        if RSSArticle.objects.filter(rssfeed=kwargs['rssfeed'],
                                     link=kwargs['link']).exists():
            raise Exception(
                "RSS Article already exists in database - RSSArticle Manager"
            )
    else:
        raise Exception("Link is missing - RSSArticle Manager")
    return super(RSSArticleManager, self).create(*args, **kwargs)
# Packages used by get_all_keywords
import jieba
from get_word2vec import get_word2vec
from pdfminer2txt import get_pdf2txt
import catchkeywords
import numpy as np
import re
import clean_html
import get_all_keywords

filename = 'test'
filepath = 'test.pdf'
stopwords_path = 'word2vec_format.txt'
html_file = 'pdf2html/test.html'
usr_keyword = ['计算机', '人工智能']
html_doc = clean_html.clean_html()
# Load the keyword list so it can be traversed later.
final_keywords = get_all_keywords.get_final_keywords(filepath, stopwords_path, usr_keyword)


def indexstr(str1, str2):
    # Return every position in str1 where the substring str2 occurs, as a list.
    length2 = len(str2)
    length1 = len(str1)
    indexstr2 = []
    i = 0
    while str2 in str1[i:]:
        indextmp = str1.index(str2, i, length1)
        indexstr2.append(indextmp)
        i = indextmp + length2  # jump past this match so the loop terminates
    return indexstr2
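# Quick sanity check of indexstr: the cursor jumps past each hit by len(str2),
# so overlapping matches are skipped.
print(indexstr('ababab', 'ab'))  # [0, 2, 4]
print(indexstr('aaaa', 'aa'))    # [0, 2], not [0, 1, 2]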
def save(self, *args, **kwargs):
    if self.id is not None:
        self.title = clean_html(self.title)
    super(RSSArticle, self).save(*args, **kwargs)  # Call the "real" save() method.
def get_sentences_from_html_v2(html, nlp=None):
    paras = clean_html(html)
    sentences = list(nlp_analyze(paras, nlp))
    return sentences
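# Hedged usage sketch: nlp_analyze is this project's own sentence splitter, so
# the spaCy pipeline passed here is an assumption, not a confirmed dependency.
import spacy

nlp = spacy.load('en_core_web_sm')  # hypothetical model choice
for sentence in get_sentences_from_html_v2('<p>One. Two.</p>', nlp=nlp):
    print(sentence)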
# Links that end in "//" lose one of the doubled trailing slashes; everything
# else passes through untouched.
cleaned = i[:-1] if i.endswith("//") else i
print(cleaned)
clean_links.append(cleaned)

# Since this output cleans all links, we can end up with a very large list,
# so we isolate only those links that include our target domain.
e_links = []
for i in clean_links:
    if 'extension.org' in i:
        e_links.append(i)

# Create a CSV document for later analysis.
for i in sorted(set(e_links)):
    link = i
    text = x.get_text_from_link(link)  # this method does exactly what it is called
    clean = clean_html(text)  # strip the HTML from the returned document
    row = [link, clean]
    file.writerow(row)  # write this row's data
    t = randrange(5)
    time.sleep(t)  # pause 0-4 seconds between requests