def api_index():
    secret_item = Secret.query.order_by(db.desc(Secret.id)).first()
    article_item = Article.query.order_by(db.desc(Article.id)).first()
    essay_item = Essays.query.order_by(db.desc(Essays.id)).first()
    ArticleItem = {
        'title': article_item.title,
        'id': article_item.id,
        'author': article_item.author.username,
        'authorID': article_item.author.id,
        'content': clean_html(article_item.content)[0:200],
        'like': len(article_item.like),
        'time': article_item.time,
    }
    EssayItem = {
        'author': essay_item.author.username,
        'authorID': essay_item.author.id,
        'content': essay_item.content,
        'like': len(essay_item.like),
        'time': essay_item.time,
        'id': essay_item.id
    }
    SecretItem = {
        'id': secret_item.id,
        'title': secret_item.title,
        'content': clean_html(secret_item.content)[0:200],
        'like': len(secret_item.like),
        'time': secret_item.time,
    }
    data = {
        'SecretItem': SecretItem,
        'ArticleItem': ArticleItem,
        'EssayItem': EssayItem
    }
    return jsonify(data)
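# A minimal defensive sketch, not part of the original app: Query.first()
# returns None on an empty table, so api_index above would raise
# AttributeError on a fresh database. `serialize_article` is a hypothetical
# helper name; the field set mirrors ArticleItem above.
def serialize_article(article_item):
    if article_item is None:
        return False
    return {
        'title': article_item.title,
        'id': article_item.id,
        'author': article_item.author.username,
        'authorID': article_item.author.id,
        'content': clean_html(article_item.content)[0:200],
        'like': len(article_item.like),
        'time': article_item.time,
    }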
def api_user_articles(user_id):
    temp_user = User.query.filter(User.id == user_id).first()
    articles = temp_user.articles
    params = request.values.to_dict()
    length = len(articles)
    article_list = []
    if length:
        if length > 10:
            # Default to the first page of ten when no range was requested.
            if not len(params):
                params = {'start': 1, 'end': 10}
            noMore = False
        else:
            params = {'start': 1, 'end': length}
            noMore = True
        params['start'] = int(params['start'])
        params['end'] = int(params['end'])
        if length >= params['start']:
            # Newest first; slicing clamps at the list length, so one loop
            # covers both the full-page and final-partial-page cases.
            for item in articles[::-1][params['start'] - 1:params['end']]:
                article_list.append({
                    'author': item.author.username,
                    'title': item.title,
                    'id': item.id,
                    'content': clean_html(item.content)[0:200],
                    'like': len(item.like),
                    'time': item.time,
                    'authorID': temp_user.id
                })
            if length <= params['end']:
                noMore = True
        else:
            noMore = True
    else:
        noMore = True
    data = {
        'ArticleList': article_list,
        'noMore': noMore,
        'username': temp_user.username
    }
    return jsonify(data)
def extract_page_text(self, searchTree):
    retrieved_ocr = None
    # If several divs match, only the last one's cleaned markup is kept.
    for a in searchTree.cssselect('div#modeTexte > div'):
        retrieved_ocr = tostring(a)
        retrieved_ocr = clean_html.clean_html(retrieved_ocr, newline_at_br=True)
    return retrieved_ocr
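# Hedged usage sketch for extract_page_text, assuming an lxml document tree
# (cssselect() additionally needs the `cssselect` package installed);
# `scraper` is a hypothetical instance of the class the method belongs to.
from lxml.html import fromstring

tree = fromstring('<div id="modeTexte"><div>line one<br>line two</div></div>')
print(scraper.extract_page_text(tree))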
def api_secrets():
    secrets = Secret.query.all()
    params = request.values.to_dict()
    length = len(secrets)
    secrets_list = []
    if length:
        if length > 10:
            if not len(params):
                params = {'start': 1, 'end': 10}
            noMore = False
        else:
            params = {'start': 1, 'end': length}
            noMore = True
        params['start'] = int(params['start'])
        params['end'] = int(params['end'])
        if length >= params['start']:
            # start/end are a 1-based, end-inclusive range, as in
            # api_user_articles; slicing clamps, so one loop covers both cases.
            for item in secrets[::-1][params['start'] - 1:params['end']]:
                secrets_list.append({
                    'title': item.title,
                    'id': item.id,
                    'content': clean_html(item.content)[0:200],
                    'like': len(item.like),
                    'time': item.time,
                })
            if length <= params['end']:
                noMore = True
        else:
            noMore = True
    else:
        noMore = True
    data = {
        'ArticleList': secrets_list,
        'noMore': noMore,
    }
    return jsonify(data)
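# api_user_articles and api_secrets repeat the same start/end pagination dance.
# A minimal sketch of a shared helper under the same request contract;
# `paginate_newest_first` is a hypothetical name, not part of the original code.
def paginate_newest_first(items, params):
    length = len(items)
    if not length:
        return [], True
    if length > 10:
        start = int(params.get('start', 1))
        end = int(params.get('end', 10))
        no_more = False
    else:
        start, end = 1, length
        no_more = True
    if length < start:
        return [], True
    page = items[::-1][start - 1:end]
    if length <= end:
        no_more = True
    return page, no_more

# e.g. page, no_more = paginate_newest_first(Secret.query.all(),
#                                            request.values.to_dict())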
def save(self, *args, **kwargs):
    if self.id is not None:
        self.title = clean_html(self.title)
        old_version = RSSFeed.objects.get(id=self.id)
        if (self.link != old_version.link) and (not validate_feed(self.link)):
            raise Exception("RSS Feed is not valid")
    super(RSSFeed, self).save(*args, **kwargs)  # Call the "real" save() method.
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSFeed_Sub Manager")
    if RSSFeed_Sub.objects.filter(user=kwargs['user'], feed=kwargs['feed']).exists():
        raise Exception("User (" + kwargs['user'].username +
                        ") is already subscribed to the RSSFeed (id = " +
                        str(kwargs['feed'].id) + ")")
    return super(RSSFeed_SubManager, self).create(*args, **kwargs)
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSFeed Manager")
    if 'link' in kwargs and isinstance(kwargs['link'], str):
        if RSSFeed.objects.filter(link=kwargs['link']).exists():
            raise Exception("RSSFeed already exists in the database.")
        elif not validate_feed(kwargs['link']):
            raise Exception("RSS Feed is not valid")
    else:
        raise Exception("Link is missing - RSSFeed Manager")
    return super(RSSFeedManager, self).create(*args, **kwargs)
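# Hedged usage sketch for the two managers above; `some_user` is a hypothetical
# User instance, and the bare Exceptions mirror what the create() overrides raise.
try:
    feed = RSSFeed.objects.create(title='<b>Example</b> feed',
                                  link='https://example.org/rss.xml')
    RSSFeed_Sub.objects.create(user=some_user, feed=feed, title=feed.title)
except Exception as err:
    print(err)  # e.g. "RSSFeed already exists in the database."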
def api_user(user_id):
    temp_user = User.query.filter(User.id == user_id).first()
    if temp_user.articles:
        last_article = temp_user.articles[-1]
        last_article_data = {
            'title': last_article.title,
            'author': temp_user.username,
            'content': clean_html(last_article.content)[0:200],
            'like': len(last_article.like),
            'time': last_article.time,
            'id': last_article.id,
            'authorID': temp_user.id
        }
    else:
        last_article_data = False
    if temp_user.essays:
        last_essay = temp_user.essays[-1]
        last_essay_data = {
            'author': temp_user.username,
            'content': last_essay.content,
            'like': len(last_essay.like),
            'time': last_essay.time,
            'authorID': temp_user.id,
            'id': last_essay.id
        }
    else:
        last_essay_data = False
    data = {
        'username': temp_user.username,
        # Fallback shown when the user never set a signature.
        'selfIntro': temp_user.self_introduction or
                     'This user came and went like the wind, leaving no signature~ ~ ~',
        'lastArticle': last_article_data,
        'lastEssay': last_essay_data
    }
    return jsonify(data)
def create(self, *args, **kwargs):
    if 'title' in kwargs and isinstance(kwargs['title'], str):
        kwargs['title'] = clean_html(kwargs['title'])
    else:
        raise Exception("Title is missing - RSSArticle Manager")
    if 'link' in kwargs and isinstance(kwargs['link'], str):
        if RSSArticle.objects.filter(rssfeed=kwargs['rssfeed'],
                                     link=kwargs['link']).exists():
            raise Exception(
                "RSS Article already exists in database - RSSArticle Manager"
            )
    else:
        raise Exception("Link is missing - RSSArticle Manager")
    return super(RSSArticleManager, self).create(*args, **kwargs)
# Packages used by get_all_keywords
import jieba
from get_word2vec import get_word2vec
from pdfminer2txt import get_pdf2txt
import catchkeywords
import numpy as np
import re
import clean_html
import get_all_keywords

filename = 'test'
filepath = 'test.pdf'
stopwords_path = 'word2vec_format.txt'
html_file = 'pdf2html/test.html'
usr_keyword = ['计算机', '人工智能']
html_doc = clean_html.clean_html()
# Load the keyword list so it can be traversed later.
final_keywords = get_all_keywords.get_final_keywords(filepath, stopwords_path, usr_keyword)


def indexstr(str1, str2):
    # Return every position in str1 where the substring str2 occurs, as a list.
    length2 = len(str2)
    length1 = len(str1)
    indexstr2 = []
    i = 0
    while str2 in str1[i:]:
        indextmp = str1.index(str2, i, length1)
        indexstr2.append(indextmp)
        i = indextmp + length2  # jump past this match so the loop terminates
    return indexstr2
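# Quick sanity check of indexstr: the cursor jumps past each hit by len(str2),
# so overlapping matches are skipped.
print(indexstr('ababab', 'ab'))  # [0, 2, 4]
print(indexstr('aaaa', 'aa'))    # [0, 2], not [0, 1, 2]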
def save(self, *args, **kwargs):
    if self.id is not None:
        self.title = clean_html(self.title)
    super(RSSArticle, self).save(*args, **kwargs)  # Call the "real" save() method.
def get_sentences_from_html_v2(html, nlp=None):
    paras = clean_html(html)
    sentences = list(nlp_analyze(paras, nlp))
    return sentences
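# Hedged usage sketch: nlp_analyze is this project's own sentence splitter, so
# the spaCy pipeline passed here is an assumption, not a confirmed dependency.
import spacy

nlp = spacy.load('en_core_web_sm')  # hypothetical model choice
for sentence in get_sentences_from_html_v2('<p>One. Two.</p>', nlp=nlp):
    print(sentence)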
# Links that end in "//" lose one of the doubled trailing slashes; everything
# else passes through untouched.
cleaned = i[:-1] if i.endswith("//") else i
print(cleaned)
clean_links.append(cleaned)

# Since this output cleans all links, we can end up with a very large list,
# so we isolate only those links that include our target domain.
e_links = []
for i in clean_links:
    if 'extension.org' in i:
        e_links.append(i)

# Create a CSV document for later analysis.
for i in sorted(set(e_links)):
    link = i
    text = x.get_text_from_link(link)  # this method does exactly what it is called
    clean = clean_html(text)  # strip the HTML from the returned document
    row = [link, clean]
    file.writerow(row)  # write this row's data
    t = randrange(5)
    time.sleep(t)  # pause 0-4 seconds between requests