def history(site, start_page_id):
    """ Archive (past log) """
    _limit = 20
    svm = generate_index_contents(site)

    # Validate the parameter; 100000000 is the sentinel for "start from the newest"
    start_page_id = int(start_page_id)
    if start_page_id == 100000000:
        pages = Page.get_new_history(site_id=site.id, _limit=_limit)
    else:
        pages = Page.get_history(site_id=site.id, pk_until=start_page_id, _limit=_limit)

    # Rank the pages
    pages = page_rank(pages)

    # Cursor for the next archive page: only set when this batch is full
    is_next = None
    if pages and len(pages) == _limit:
        last_page_id = pages[-1].id
        is_next = last_page_id - 1

    # `keyword` was undefined in this scope; default to None so the template renders
    keyword = None
    return render_template('dat/history.html',
                           site=site,
                           keyword=keyword,
                           list_pages=pages,
                           svm=svm,
                           is_next=is_next)
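# The handler above does keyset pagination: each response's `is_next`
# (last id minus one) becomes the next request's `start_page_id`, with
# 100000000 meaning "newest first". A minimal sketch of walking the whole
# archive under that cursor contract (`iter_history` is a hypothetical
# helper; it assumes get_history returns pages in descending-id order,
# as the handler implies):
def iter_history(site, _limit=20):
    pages = Page.get_new_history(site_id=site.id, _limit=_limit)
    while pages:
        yield from pages
        if len(pages) < _limit:
            break  # partial batch: nothing older remains
        pages = Page.get_history(site_id=site.id,
                                 pk_until=pages[-1].id - 1,
                                 _limit=_limit)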
def index(site, page_id):
    # Validate the parameter and build the main contents
    page_id = int(page_id)
    try:
        contents = Page.get_by_site(page_id, site.id)
    except Page.DoesNotExist:
        app_log(logging.ERROR, "Page does not exist site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.DoesNotExist)

    # Error out if the page is outside its publication window
    if not contents.is_enable():
        app_log(logging.ERROR, "Page is not open site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.NotOpen)

    # Extra pages to append to the index
    extend_page = contents.get_history_from_myself()
    if contents and contents.prev_page:
        ignore_ids = [page_id, contents.prev_page.id]
    else:
        ignore_ids = [page_id]

    try:
        svm = generate_index_contents(site, extend_page=extend_page, ignore_ids=ignore_ids)
    except SiteEmptyError:
        app_log(logging.WARNING, "site is empty site_id:{} page_id:{}".format(site.id, page_id))
        return error_page(site, ErrorPageCategory.SiteIsEmpty)

    # Record page views (sampled: add 20 with probability 1/21 to cut DB writes)
    if random.randint(0, 20) == 1:
        contents.count_up(20)

    return render_template('dat/page.html',
                           contents=contents,
                           site=site,
                           svm=svm)
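# The view-count sampling above increments by 20 with probability 1/21
# (random.randint(0, 20) has 21 equally likely outcomes), so the counter
# tracks roughly 20/21 ~ 0.95 of true views while doing 21x fewer writes.
# A minimal sketch of the estimator (`sampled_count` is a hypothetical
# helper, not part of this codebase):
import random

def sampled_count(views, step=20, outcomes=21):
    total = 0
    for _ in range(views):
        if random.randint(0, outcomes - 1) == 1:  # fires with probability 1/outcomes
            total += step
    return total

# Over 100000 views the total is typically near 95000 (step/outcomes per view).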
def update_at(self):
    from module.site.page import Page
    import datetime
    import pytz

    pages = Page.get_new_history(self.id)
    new_list = sorted(pages, key=lambda x: x.id, reverse=True)
    now = datetime.datetime.now(pytz.utc)
    # Return the publish time of the newest enabled page
    for page in new_list:
        if page.is_enable(now):
            return page.open_at
    raise ValueError("no enabled page for site_id:{}".format(self.id))
def main(subject):
    # Load the thread
    posts = {}
    for posted in dat_reader(subject.dat_url):
        posts[posted.num] = posted

    # Parse, drop broken posts, and weight posts by their replies
    posts = analyze_post(posts)

    # Keyword analysis
    r_indexes = analyze_keyword(posts)

    # Apply the keyword analysis to posts, then bulk-register keyword data in the DB
    insert_keyword(posts, r_indexes, subject.site.id)

    # Output highly rated posts
    pages = []
    for key in posts:
        output = PageRepository(_type=PageType.POST_RANK.value)
        if posts[key].priority > 300:
            # Highly rated post
            print("++++++++++++++++++++")
            print(posts[key].priority)
            print("++++++++++++++++++++")
            posts[key].printer(posts=posts, output=output)

            # Record for DB output
            pages.append(output)

    # Output posts with high keyword scores
    for r_index in r_indexes:
        pages.append(printer_res(r_index, posts))

    # Build the records to store in the DB
    pages = filter_overlap(pages)
    keyword_record_dict = {r_index.keyword: r_index.keyword_record for r_index in r_indexes}
    bulk_pages = [page.output_for_page(subject, keyword_record_dict)
                  for page in pages if page.is_enable]

    # Bulk insert!
    pages = Page.bulk_insert(bulk_pages)
    PageKeywordRelation.register(pages)

    # Set optimal publish dates
    set_start_at(pages)
def get_pr_page(site, _limit=3):
    """
    Fetch pages from another site, for crawler optimization.
    :param site: Site
    :param _limit: int
    :return: list[Page]
    """
    other_site = _get_other_site(site)
    pages = Page.get_new_history(other_site.id, _limit=30)
    pages = sorted(pages, key=lambda x: x.view_count, reverse=True)
    result = []
    now = datetime.datetime.now(pytz.utc)
    for page in pages:
        if page.is_enable(now):
            result.append(page)
            if len(result) >= _limit:
                return result
    return result
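# The loop above is a "filter, then take the first N" pass. As an aside,
# the same thing written with itertools.islice (`first_enabled` is a
# hypothetical helper, not part of this codebase):
from itertools import islice

def first_enabled(pages, now, limit=3):
    return list(islice((p for p in pages if p.is_enable(now)), limit))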
def sitemap():
    """ sitemap.xml for the Google crawler """
    all_sites = Site.get_all()
    new_pages = Page.gets_new(10000)
    new_keywords = PageKeywordRelation.gets_new(10000)

    # Evaluate enablement as of one hour ago, so pages opened within the
    # last hour are excluded from the sitemap
    now = datetime.datetime.now(pytz.utc) - datetime.timedelta(seconds=3600)
    new_keyword_pages = [keyword for keyword in new_keywords
                         if keyword.page and keyword.page.is_enable(now)]
    return render_template('sitemap/sitemap.html',
                           url_base='http://www.niku.tokyo/',
                           new_site_date=max([site.created_at for site in all_sites]),
                           all_sites=all_sites,
                           new_pages=new_pages,
                           new_keyword_pages=new_keyword_pages,
                           one_days_ago=datetime.datetime.now() - datetime.timedelta(days=1),
                           three_days_ago=datetime.datetime.now() - datetime.timedelta(days=3),
                           )
def run(self):
    # Do nothing if PageKeywordRelation already has any records
    if PageKeywordRelation.objects().filter().count() > 0:
        raise AssertionError("PageKeywordRelation data already exists")

    # Fetch all pages, ordered by id
    page_all = Page.get_all()
    page_all = sorted(page_all, key=lambda x: x.id)

    # Bulk-register 10 pages at a time
    count = 0
    pages = []
    for page in page_all:
        pages.append(page)
        count += 1
        if len(pages) >= 10:
            PageKeywordRelation.register(pages)
            pages = []
            print("{}/{}".format(count, len(page_all)))

    # Flush the final partial batch
    if pages:
        PageKeywordRelation.register(pages)
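# The accumulate-and-flush loop above is the usual fixed-size batching
# idiom. A minimal equivalent (`chunked` is a hypothetical helper):
def chunked(items, size=10):
    for i in range(0, len(items), size):
        yield items[i:i + size]

# for batch in chunked(page_all):
#     PageKeywordRelation.register(batch)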
def tests_page_models():
    # # insert
    # page = Page(site_id=1,
    #             dat_id=12345,
    #             page="agraeg43g34qhg43qh43qh34")
    # page2 = Page(site_id=1,
    #              dat_id=112345,
    #              page="agraeg43g34qhg43qh43qh34")
    #
    # Page.bulk_insert([page, page2])
    #
    # # update
    # page2.dat_id = 22222
    # page2.save()

    # Every page must map to one of the five background images
    all_pages = Page.objects().all()
    for page in all_pages:
        _id = page.site.get_background_image_id(page.id)
        assert 1 <= _id <= 5
        print(_id)
def output_for_page(self, subject, keyword_record_dict):
    """
    Build the Page record for DB output.
    :param subject: Subject
    :param keyword_record_dict: dict{int: Keyword}
    :rtype : Page
    """
    s = ''.join([post.generate_post_message_for_db() for post in self.output])
    keyword_record_ids = [keyword_record.id
                          for keyword_record in self.get_keyword_record_ids(keyword_record_dict)]
    page_top_post = '<br/>'.join(self.output[0].post_message_for_output)
    return Page(
        site_id=subject.site.id,
        dat_id=subject.dat_id,
        page=s,
        page_top=page_top_post,
        type=self.matome_type,
        _keywords=','.join([str(_id) for _id in keyword_record_ids]),
    )
def set_start_at(pages):
    """
    Spread publish times optimally across a 48-hour window.
    :param pages: list(Page)
    :return:
    """
    # Do nothing for 3 pages or fewer
    if len(pages) <= 3:
        return

    # Skip if 30 or more pages are already scheduled within the next 48 hours
    feature_page = Page.get_feature_page(pages[0].site_id)
    print(feature_page)
    if len(feature_page) >= 30:
        return

    # Split and schedule: the first two pages go to tomorrow, the rest to today
    _today_page = pages[2:]
    _tomorrow_page = pages[:2]
    _set_start_at(_today_page)
    _set_start_at(_tomorrow_page, time_shift=datetime.timedelta(hours=24))
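# `_set_start_at` is not shown in this excerpt. A minimal sketch of one
# plausible implementation, assuming it spreads the pages evenly across
# the next 24 hours (everything here is an assumption, including the use
# of page.start_at and page.save()):
import datetime
import pytz

def _set_start_at(pages, time_shift=datetime.timedelta()):
    if not pages:
        return
    base = datetime.datetime.now(pytz.utc) + time_shift
    step = datetime.timedelta(hours=24) / len(pages)
    for i, page in enumerate(pages):
        page.start_at = base + step * i
        page.save()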
# -*- coding: utf-8 -*-
from module.scraping.search import SearchManager
from module.site.page import Keyword, Page
from module.site.site import Site

# The repeated lookups below hit the same data several times in a row,
# e.g. to eyeball query counts or caching behavior
for i in range(1, 10):
    print(Keyword.get(i))
for i in range(1, 10):
    print(Keyword.get(i))

page = Page.get(1)
print(page.keywords)
print(page.keywords)
print(page.keywords)
print(page.keywords)
print(page.tile_label)
def generate_index_contents(site, _limit=30, extend_page=None, ignore_ids=()):
    """
    Build the data for the top-page view.
    :param site: Site
    :param _limit: int
    :param extend_page: list(Page)
    :param ignore_ids: list(int)
    :return: SiteViewModel
    """
    pages = Page.get_new_history(site.id, _limit=_limit)
    if extend_page:
        pages += extend_page

    # Hide articles whose publish date is in the future
    now = datetime.datetime.now(pytz.utc)
    pages = [page for page in pages if page.is_enable(now=now)]
    if ignore_ids:
        pages_repository = {page.id: page for page in pages if page.id not in ignore_ids}
    else:
        pages_repository = {page.id: page for page in pages}
    pages = list(pages_repository.values())

    # No pages at all
    if not pages:
        raise SiteEmptyError

    # 10 pages or fewer: fall back to a random layout
    if len(pages) <= 10:
        pr_pages = get_pr_page(site)
        return SiteViewModel(site=site,
                             contents=random.choice(pages),
                             panels=[random.choice(pages) for x in range(6)],
                             page_list=pages + pr_pages)

    # From the 10 newest pages, pick the single most viewed one
    new_list = sorted(pages, key=lambda x: x.id, reverse=True)[:10]
    new_list = sorted(new_list, key=lambda x: x.view_count, reverse=True)
    if len(new_list) == len([page for page in new_list if page.start_at]):
        new_list = sorted(new_list, key=lambda x: x.start_at, reverse=True)
    contents = new_list[0]
    new_list = new_list[1:]
    pages_repository.pop(contents.id)

    # From the remaining 9 newest pages, pick 3 for the panels
    random.shuffle(new_list)
    panels = new_list[1:4]
    for panel_page in panels:
        panel_page.set_favorite(False)
        pages_repository.pop(panel_page.id)

    # From the remaining pages, pick the 3 most viewed
    left_pages = list(pages_repository.values())
    left_pages = sorted(left_pages, key=lambda x: x.view_count, reverse=True)
    for x in range(3):
        panel_page = left_pages.pop(0)  # pop(0): the list is sorted most-viewed first
        panel_page.set_favorite(True)
        panels.append(panel_page)
    random.shuffle(panels)

    # Grade the remaining pages by view count
    for page in left_pages[0:3]:  # 3 pages
        page.set_view_level(PageViewLevel.SUPERNOVA)
    for page in left_pages[3:5]:  # 2 pages
        page.set_view_level(PageViewLevel.HOT)
    for page in left_pages[5:7]:  # 2 pages
        page.set_view_level(PageViewLevel.WARM)

    # Sort the rest by id, descending
    left_pages = sorted(left_pages, key=lambda x: x.id, reverse=True)

    # Append PR pages for the crawler
    left_pages = left_pages[:20]
    left_pages += get_pr_page(site)
    return SiteViewModel(site=site,
                         contents=contents,
                         panels=panels,
                         page_list=left_pages)
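# A sketch of how a top-page route would consume the helper above, following
# the pattern of the other views in this section (the route name `top` and
# the template path 'dat/index.html' are hypothetical):
def top(site):
    try:
        svm = generate_index_contents(site)
    except SiteEmptyError:
        return error_page(site, ErrorPageCategory.SiteIsEmpty)
    return render_template('dat/index.html', site=site, svm=svm)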
@property
def page(self):
    # Assumed to be a property: sitemap() reads `keyword.page` as an
    # attribute, not as a call
    return Page.get(self.page_id)