def main(subject):
    """Read the dat file for *subject*, score its posts, and persist the
    high-priority and keyword-ranked ones as Page records.

    Args:
        subject: object exposing ``dat_url`` and ``site.id``.
    """
    # Load every post, keyed by its post number.
    posts = {}
    for posted in dat_reader(subject.dat_url):
        posts[posted.num] = posted

    # Parse, discard broken posts, and weight each post by its replies.
    posts = analyze_post(posts)

    # Keyword analysis over the whole thread.
    r_indexes = analyze_keyword(posts)

    # Reflect the keyword analysis onto posts, then bulk-register the
    # keyword data in the DB.
    insert_keyword(posts, r_indexes, subject.site.id)

    # Collect the highly rated posts.
    pages = []
    for post in posts.values():  # iterate values directly, not keys
        if post.priority > 300:
            # Build the repository object only for qualifying posts
            # (the original allocated one per post unconditionally).
            output = PageRepository(_type=PageType.POST_RANK.value)
            # Debug trace of the post's score.
            print("++++++++++++++++++++")
            print(post.priority)
            print("++++++++++++++++++++")
            post.printer(posts=posts, output=output)
            # Record for DB output.
            pages.append(output)

    # Posts whose keyword score is high.
    for r_index in r_indexes:
        pages.append(printer_res(r_index, posts))

    # Build the records to store in the DB.
    pages = filter_overlap(pages)
    keyword_record_dict = {
        r_index.keyword: r_index.keyword_record for r_index in r_indexes
    }
    bulk_pages = [
        page.output_for_page(subject, keyword_record_dict)
        for page in pages
        if page.is_enable
    ]

    # Bulk insert!
    pages = Page.bulk_insert(bulk_pages)
    PageKeywordRelation.register(pages)

    # Choose the optimal publish date for each page.
    set_start_at(pages)
def keyword(site, keyword_id, start_keyword_id):
    """Render dat/keyword.html: the newest publishable page for the
    keyword, the previous one, and the remaining related pages.
    """
    _limit = 25

    # Parameter check / normalisation.
    keyword_id = int(keyword_id)
    start_keyword_id = int(start_keyword_id)
    keyword = Keyword.get(keyword_id)

    # 100000000 is the sentinel meaning "start from the newest relation".
    if start_keyword_id == 100000000:
        fetched = PageKeywordRelation.get_from_new_keyword(keyword_id, _limit=_limit)
    else:
        fetched = PageKeywordRelation.get_from_keyword(keyword_id, start_keyword_id, _limit=_limit)

    # Drop relations whose page is not currently publishable.
    now = datetime.datetime.now(tz=pytz.utc)
    keyword_relation = [rel for rel in fetched if rel.page.is_enable(now=now)]

    is_end = False
    remaining = len(keyword_relation)
    if remaining >= 2:
        # Current page, previous page, and everything after as "relations".
        contents = keyword_relation[0].page
        prev_contents = keyword_relation[1].page
        is_next = keyword_relation[1].id
        relations = keyword_relation[2:]
    elif remaining == 1:
        # Exactly one publishable page left: this is the end of the chain.
        contents = keyword_relation[0].page
        prev_contents = None
        is_next = None
        is_end = True
        relations = keyword_relation[2:]  # empty slice of a 1-element list
    else:
        # Nothing publishable at all; template gets None for relations.
        contents = None
        prev_contents = None
        is_next = None
        relations = None

    return render_template('dat/keyword.html',
                           site=site,
                           keyword=keyword,
                           contents=contents,
                           prev_contents=prev_contents,
                           relations=relations,
                           is_next=is_next,
                           is_end=is_end)
def main(subject):
    """Read the dat file for *subject*, score its posts, and persist the
    high-priority and keyword-ranked ones as Page records.

    NOTE(review): this is a byte-for-byte duplicate of another main() in
    this file — the later definition wins at import time; confirm which
    copy is intended and delete the other.
    """
    # Load every post, keyed by its post number.
    posts = {}
    for posted in dat_reader(subject.dat_url):
        posts[posted.num] = posted

    # Parse, discard broken posts, and weight each post by its replies.
    posts = analyze_post(posts)

    # Keyword analysis over the whole thread.
    r_indexes = analyze_keyword(posts)

    # Reflect the keyword analysis onto posts, then bulk-register the
    # keyword data in the DB.
    insert_keyword(posts, r_indexes, subject.site.id)

    # Collect the highly rated posts.
    pages = []
    for post in posts.values():  # iterate values directly, not keys
        if post.priority > 300:
            # Build the repository object only for qualifying posts
            # (the original allocated one per post unconditionally).
            output = PageRepository(_type=PageType.POST_RANK.value)
            # Debug trace of the post's score.
            print("++++++++++++++++++++")
            print(post.priority)
            print("++++++++++++++++++++")
            post.printer(posts=posts, output=output)
            # Record for DB output.
            pages.append(output)

    # Posts whose keyword score is high.
    for r_index in r_indexes:
        pages.append(printer_res(r_index, posts))

    # Build the records to store in the DB.
    pages = filter_overlap(pages)
    keyword_record_dict = {
        r_index.keyword: r_index.keyword_record for r_index in r_indexes
    }
    bulk_pages = [
        page.output_for_page(subject, keyword_record_dict)
        for page in pages
        if page.is_enable
    ]

    # Bulk insert!
    pages = Page.bulk_insert(bulk_pages)
    PageKeywordRelation.register(pages)

    # Choose the optimal publish date for each page.
    set_start_at(pages)
def run(self):
    """Backfill PageKeywordRelation from every existing Page, in batches.

    Raises:
        AssertionError: if PageKeywordRelation already holds any row —
            this job must only run against an empty relation table.
    """
    # Refuse to run twice: even one existing row aborts the backfill.
    if PageKeywordRelation.objects().filter().count() > 0:
        raise AssertionError("PageKeywordRelation data is exist")

    # Fetch every page and process in deterministic (id) order.
    page_all = sorted(Page.get_all(), key=lambda page: page.id)
    total = len(page_all)

    # Register in batches of 10.  The original flushed only when the
    # buffer exceeded 10 (i.e. at 11 rows) — off-by-one against the
    # stated batch size.
    count = 0
    batch = []
    for page in page_all:
        batch.append(page)
        count += 1
        if len(batch) >= 10:
            PageKeywordRelation.register(batch)
            batch = []
            print("{}/{}".format(count, total))
    # Flush the final partial batch; skip the call entirely when empty.
    if batch:
        PageKeywordRelation.register(batch)
def sitemap():
    """Render sitemap.xml for the Google crawler."""
    all_sites = Site.get_all()
    new_pages = Page.gets_new(10000)
    new_keywords = PageKeywordRelation.gets_new(10000)

    # Publishability cutoff: one hour ago, timezone-aware (UTC).
    now = datetime.datetime.now(pytz.utc) - datetime.timedelta(seconds=3600)
    new_keyword_pages = [
        keyword for keyword in new_keywords
        if keyword.page and keyword.page.is_enable(now)
    ]

    # Sample the naive clock once so both cutoffs share the same instant
    # (the original called datetime.now() twice).
    # NOTE(review): these are naive datetimes while `now` above is
    # UTC-aware — confirm the template never compares them to aware values.
    now_naive = datetime.datetime.now()
    return render_template('sitemap/sitemap.html',
                           url_base='http://www.niku.tokyo/',
                           new_site_date=max([site.created_at for site in all_sites]),
                           all_sites=all_sites,
                           new_pages=new_pages,
                           new_keyword_pages=new_keyword_pages,
                           one_days_ago=now_naive - datetime.timedelta(days=1),
                           three_days_ago=now_naive - datetime.timedelta(days=3),
                           )
def get_count(cls, keyword_id):
    """Return how many page-keyword relations exist for *keyword_id*."""
    # Imported inside the function — presumably to break a circular
    # import at module load time; confirm before hoisting to file level.
    from module.site.page_keyword import PageKeywordRelation as relation_model
    return relation_model.get_count(keyword_id)