def handle(self, *args, **options):
    """Crawl and store a profile image for every member that needs one.

    ``--group`` limits the run to one group (only IDs 1 and 2 are supported),
    ``--force`` re-fetches images of non-graduated members, and ``--fforce``
    re-fetches unconditionally.
    """
    group_id = options['group']
    # Only group IDs 1 and 2 (or no filter at all) are supported.
    if group_id not in (None, 1, 2):
        otapick.print_console('groupID {} is not supported.'.format(group_id))
        quit()

    crawler = otapick.MemberImageCrawler()
    crawler_ex = otapick.MemberImageCrawlerEx()
    downloader = otapick.MemberImageDownloader()
    downloader_ex = otapick.MemberImageDownloaderEx()

    members = Member.objects.filter(temporary=False, is_other=False)
    if group_id is not None:
        members = members.filter(belonging_group__group_id=group_id)

    for member in members:
        needs_fetch = (not member.image
                       or (options['force'] and not member.graduate)
                       or options['fforce'])
        if not needs_fetch:
            continue
        if member.graduate:
            # Graduated members are not on the official site; use the Ex crawler.
            image_url = crawler_ex.crawl(member.full_kanji)
            media = downloader_ex.download(
                image_url, member.belonging_group.group_id, member.ct)
        else:
            image_url = crawler.crawl(
                group_key=member.belonging_group.key, ct=member.ct)
            media = downloader.download(
                image_url, member.belonging_group.group_id, member.ct)
        if media is not None:
            member.image = media
            member.save()
            otapick.print_console("{}'s image is saved!!".format(
                member.full_kanji))
def get_article_tag(group_key, soup):
    """Return the tag wrapping the blog article body for the given group.

    Returns None (after logging) when the tag is missing, or silently when
    the group key is unknown.
    """
    if group_key in ('keyaki', 'sakura'):
        lookup = ('div', 'box-article')
    elif group_key == 'hinata':
        lookup = ('div', 'c-blog-article__text')
    else:
        return
    tag_name, css_class = lookup
    article_tag = soup.find(tag_name, class_=css_class)
    if article_tag is None:
        otapick.print_console('article_tag not found')
    return article_tag
def get_blog_tag(group_key, soup):
    """Return the top-level tag of a blog-detail page for the given group."""
    selectors = {
        'keyaki': 'article',
        'hinata': 'div.p-blog-article',
        'sakura': 'article.post',
    }
    if group_key not in selectors:
        return
    blog_tag = soup.select_one(selectors[group_key])
    if blog_tag is None:
        otapick.print_console('blog_tag not found')
    return blog_tag
def get_blog_title_tag(group_key, blog_tag):
    """Return the tag holding the blog title inside *blog_tag*."""
    selectors = {
        'keyaki': '.box-ttl a',
        'hinata': 'div.c-blog-article__title',
        'sakura': 'h1.title',
    }
    if group_key not in selectors:
        return
    title_tag = blog_tag.select_one(selectors[group_key])
    if title_tag is None:
        otapick.print_console('title_tag not found')
    return title_tag
def get_blog_tags(group_key, soup):
    """Return every blog entry tag on a blog-list page for the given group."""
    selectors = {
        'keyaki': 'article',
        'hinata': 'div.p-blog-article',
        'sakura': 'ul.com-blog-part > li.box',
    }
    if group_key not in selectors:
        return
    blog_tags = soup.select(selectors[group_key])
    # select() returns a list, so this is defensive only; kept for parity.
    if blog_tags is None:
        otapick.print_console('blog_tags not found')
    return blog_tags
def get_blog_writer_name(group_key, blog_tag):
    """Return the writer's display name found inside *blog_tag*.

    Returns None (after logging) when the name tag is missing, or silently
    when the group key is unknown.
    """
    if group_key == 'keyaki':
        name_tag = blog_tag.select_one('div.box-ttl > p.name')
    elif group_key == 'hinata':
        name_tag = blog_tag.select_one(
            'div.p-blog-article__info > div.c-blog-article__name')
    elif group_key == 'sakura':
        name_tag = blog_tag.select_one('div.blog-foot p.name')
    else:
        return
    # Bug fix: the original took `.text` before checking for None, so a
    # missing tag raised AttributeError instead of reaching the intended
    # "writer_name not found" branch. Guard the tag itself first.
    if name_tag is None:
        otapick.print_console('writer_name not found')
        return
    return name_tag.text
def handle(self, *args, **options):
    """Shift per-day scores for all blogs and images.

    With ``--reverse`` the shift runs in the opposite order and without
    timing output.
    """
    if options['reverse']:
        otapick.shift_score(blogs=Blog.objects.all(), order=False)
        otapick.shift_score(images=Image.objects.all(), order=False)
        return
    # Forward run: time each pass and report it.
    started = time.time()
    otapick.shift_score(blogs=Blog.objects.all(), order=True)
    otapick.print_console('finished shift_per_day blogs!!: {}s'.format(
        round(time.time() - started, 2)))
    started = time.time()
    otapick.shift_score(images=Image.objects.all(), order=True)
    otapick.print_console(
        'finished shift_per_day images!!: {}s'.format(
            round(time.time() - started, 2)))
def handle(self, *args, **options):
    """Recompute scores for blogs and images.

    ``--recommend`` switches to the member-based recommendation scoring;
    otherwise the plain score is recalculated. ``--tweet`` additionally
    tweets the popularity top 3 images per active group.
    """
    if options['recommend']:
        high_score_members, divided_blogs, divided_images = otapick.init_calc_recommend_score(
        )
        started = time.time()
        otapick.calc_recommend_score(high_score_members,
                                     divided_blogs=divided_blogs)
        otapick.print_console(
            'finished calc_recommend_score blogs!!: {}s'.format(
                round(time.time() - started, 2)))
        started = time.time()
        otapick.calc_recommend_score(high_score_members,
                                     divided_images=divided_images)
        otapick.print_console(
            'finished calc_recommend_score images!!: {}s'.format(
                round(time.time() - started, 2)))
    else:
        started = time.time()
        otapick.calc_score(blogs=Blog.objects.all())
        otapick.print_console('finished calc_score blogs!!: {}s'.format(
            round(time.time() - started, 2)))
        started = time.time()
        otapick.calc_score(images=Image.objects.all())
        otapick.print_console('finished calc_score images!!: {}s'.format(
            round(time.time() - started, 2)))

    # tweet popularity top 3 images
    if options['tweet']:
        for group in Group.objects.filter(is_active=True):
            otapick.PopularityBot().tweet(group_id=group.group_id)
def get_blog_url(group_key, blog_tag):
    """Extract the blog-detail URL (href) from a blog list entry tag."""
    if group_key == 'keyaki':
        # The URL sits in the second <li> of the bottom bar.
        bottom_items = blog_tag.select_one('div.box-bottom ul').select('li')
        blog_url = bottom_items[1].find('a').get('href')
    elif group_key == 'hinata':
        detail_button = blog_tag.select_one('div.p-button__blog_detail')
        blog_url = detail_button.find('a').get('href')
    elif group_key == 'sakura':
        blog_url = blog_tag.find('a').get('href')
    else:
        return
    if blog_url is None:
        otapick.print_console('blog_url not found')
    return blog_url
def get_blog_postdate_tag(group_key, blog_tag):
    """Return the tag carrying a blog entry's post date."""
    if group_key == 'keyaki':
        # The date is the first <li> of the bottom bar.
        bottom_items = blog_tag.select_one('div.box-bottom ul').select('li')
        postdate_tag = bottom_items[0]
    elif group_key == 'hinata':
        postdate_tag = blog_tag.select_one(
            'div.p-blog-article__info > div.c-blog-article__date')
    elif group_key == 'sakura':
        postdate_tag = blog_tag.select_one('div.blog-foot p.date.wf-a')
    else:
        return
    if postdate_tag is None:
        otapick.print_console('postdate_tag not found')
    return postdate_tag
def get_member_image_tag(group_key, soup):
    """Return the <img> tag of a member's profile image on their profile page.

    Returns None (after logging) when the wrapper or image is missing, or
    silently when the group key is unknown.
    """
    if group_key == 'keyaki':
        image_tag = soup.find('div', class_='box-profile_img')
    elif group_key == 'hinata':
        image_tag = soup.find('div', class_='c-member__thumb')
    elif group_key == 'sakura':
        image_tag = soup.find('p', class_='ph')
    else:
        return
    if image_tag is None:
        otapick.print_console(
            'image_tag_wrapper not found. 指定したctのメンバーのプロフィールページが存在しない場合がございます。'
        )
        # Bug fix: the original fell through here and crashed with
        # AttributeError on None.find('img'); bail out instead.
        return
    image_tag = image_tag.find('img')
    if image_tag is None:
        otapick.print_console('image_tag not found')
    return image_tag
def exe_unregistration(blog, group_id, group_key):
    """Verify a suspicious blog against the official site and unregister it.

    Re-crawls the blog's detail page: a 404 means it is gone from the
    official blog, so the local record is deleted; otherwise the record is
    kept and a re-registration pass is triggered.
    """
    sleep_seconds = 1
    blog_info = otapick.BlogDetailCrawler().crawl(
        group_key=group_key, blog_ct=blog.blog_ct)
    if blog_info is None:
        # Crawl failed outright; decide nothing.
        return
    label = str(blog.blog_ct) + "/" + str(group_id)
    if blog_info == 404:
        otapick.print_console(label + ' blog is not found in official blog. unregister this.')
        blog.delete()
        return
    otapick.print_console(label + ' blog is found in official blog. leave this. and execute get_blogs -p 5 -a -t')
    # If an unacquired blog sits mid-page, this function would be called
    # endlessly — re-register a few pages to fill the gap.
    otapick.register_blogs(
        group_id=group_id, up_limit=5, all_check=True, tweet=True)
    time.sleep(sleep_seconds)
def unregister(correct_cts_list, group, unregister_num):
    """Find locally-saved blogs that no longer exist on the official blog list.

    Compares the first ``unregister_num`` pages of locally saved blogs
    against ``correct_cts_list`` (per-page lists of blog cts seen on the
    official site) and hands any missing ones to exe_unregistration.
    """
    group_id = group.group_id
    paginate_by = group.blog_list_paginate_by
    blog_crawler = otapick.BlogListCrawler()
    groups = Group.objects.filter(group_id=group_id)
    if not groups.exists():
        return
    group_key = groups.first().key
    for page in range(unregister_num):
        # e.g. when unregister_num was set to 2 or more but registration
        # already finished on page 1, the cts for this page were never
        # collected — crawl them now.
        if len(correct_cts_list) <= page and len(correct_cts_list) < unregister_num:
            blogs_data = blog_crawler.crawl(group_key, page)
            # NOTE(review): crawl() can apparently return None (see callers);
            # that would raise TypeError in this comprehension — confirm.
            correct_cts_list.append([blog_info['blog_ct'] for blog_info in blogs_data])
        # Walk the same page-slice of locally saved, non-graduate blogs,
        # newest first, mirroring the official list's pagination.
        for blog in Blog.objects.filter(publishing_group__group_id=group_id, writer__graduate=False).order_by('-post_date', 'order_for_simul')[paginate_by*page:paginate_by*(page+1)]:
            if not blog.blog_ct in correct_cts_list[page]:
                otapick.print_console(str(blog.blog_ct) + "/" + str(group_id) + ' blog is not found in official blog on page ' + str(page) + '.')
                otapick.print_console('Investigate in detail...')
                exe_unregistration(blog, group_id, group_key)
def handle(self, *args, **options):
    """Pad recent blogs and images with small random view/download counts.

    Requires ``--date`` (``YYYY/MM/DD``); everything posted between that
    date and now is considered. Counters already above ``threshold`` are
    left untouched.
    """
    threshold = 10  # pad only counters still at or below this value
    blog_num_of_views_range = (10, 50)
    image_num_of_views_range = (10, 50)
    image_num_of_downloads_range = (10, 30)

    if not options['date']:
        otapick.print_console('date option is required.')
        return
    try:
        fromDate = datetime.strptime(options['date'], '%Y/%m/%d')
    except (TypeError, ValueError):
        # Bug fix: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt; only parse failures belong here.
        otapick.print_console('date format is wrong.')
        return

    today = timezone.now()
    blogs = Blog.objects.filter(post_date__range=(fromDate, today))
    images = Image.objects.filter(publisher__post_date__range=(fromDate, today))

    for blog in blogs:
        if not blog.num_of_views > threshold:
            # num_of_views
            fake_blog_num_of_views = random.randint(
                *blog_num_of_views_range)
            otapick.increment_num_of_views(blog=blog,
                                           num=fake_blog_num_of_views)
            otapick.print_console(f'add fake score「{blog.title}」!!')

    for image in images:
        if not image.num_of_views > threshold:
            # num_of_views
            fake_image_num_of_views = random.randint(
                *image_num_of_views_range)
            otapick.increment_num_of_views(image=image,
                                           num=fake_image_num_of_views)
        if not image.num_of_downloads > threshold:
            # num_of_downloads
            fake_image_num_of_downloads = random.randint(
                *image_num_of_downloads_range)
            otapick.increment_num_of_downloads(
                image, image.publisher, num=fake_image_num_of_downloads)
            otapick.print_console(
                f'add fake score「{image.publisher}」({image.order}) !!')
def handle(self, *args, **options):
    """Re-assign blogs from temporary member records to their real writers,
    then delete a temporary member once no blog references it."""
    # 1
    blog_detail_crawler = otapick.BlogDetailCrawler()
    # self.independent_temporary_cts: presumably (group_id, ct) pairs for
    # temporary members — TODO confirm against the class definition.
    for temporary_member_info in self.independent_temporary_cts:
        if Member.objects.filter(belonging_group__group_id=temporary_member_info[0], ct=temporary_member_info[1]):
            temporary_member = Member.objects.get(belonging_group__group_id=temporary_member_info[0], ct=temporary_member_info[1])
            # Re-crawl each blog's detail page to discover the true writer.
            for blog in Blog.objects.filter(writer=temporary_member):
                blog_info = blog_detail_crawler.crawl(group_key=temporary_member.belonging_group.key, blog_ct=blog.blog_ct)
                if blog_info is not None:
                    blog.writer = blog_info['member']
                    blog.save()
                    otapick.print_console('「{}」 changed writer {}'.format(blog.title, blog_info['member']))
                else:
                    otapick.print_console('blog crawl failed...')
        else:
            # NOTE(review): `temporary_member` is only bound in the branch
            # above — on the first iteration this raises NameError, and on
            # later iterations it refers to the *previous* loop's member.
            # Looks like a bug; confirm the intended control flow.
            if not Blog.objects.filter(writer=temporary_member).exists():
                otapick.print_console('success writer({}) migration work!!'.format(temporary_member.full_kanji))
                otapick.print_console('delete {}!!'.format(temporary_member.full_kanji))
                temporary_member.delete()
def exe_registration(blog_info_list, post_date, group_id, all_check, tweet, console):
    """
    Register a batch of blogs that share the same post_date.

    Args:
        blog_info_list (list): blog-info dicts; all entries are assumed to
            share the same post_date.
        post_date (date): the common post_date of the batch.
        group_id (int): group ID.
        all_check (bool): keep processing even when an already-saved blog
            is found.
        tweet (bool): whether to tweet an update notification.
        console (bool): whether to print progress to the console.
    Returns:
        True (stop the registration loop), False (continue registering).
    """
    download_count = 0
    blog_objects = []
    image_objects = []
    for i, blog_info in enumerate(blog_info_list):
        # new blog
        if not Blog.objects.filter(blog_ct=blog_info['blog_ct'], publishing_group__group_id=group_id).exists():
            blog = Blog(
                blog_ct=blog_info['blog_ct'],
                title=blog_info['title'],
                post_date=post_date,
                # index inside the same-time batch fixes the display order
                order_for_simul=i,
                writer=blog_info['member'],
                publishing_group=Group.objects.filter(
                    group_id=group_id).first(),
            )
            blog_objects.append(blog)
            download_count += 1
        # already saved
        else:
            blog = Blog.objects.get(
                blog_ct=blog_info['blog_ct'], publishing_group__group_id=group_id)
        if len(blog_info['image_urls']) > 0:
            order = 0
            for image_url in blog_info['image_urls']:
                # NOTE(review): this condition only depends on `blog`, not on
                # image_url — it skips ALL downloads once the blog has any
                # Image row; confirm that is intentional.
                if not Image.objects.filter(publisher=blog).exists():
                    media = otapick.BlogImageDownloader().download(
                        image_url, group_id, blog.blog_ct, blog.writer.ct)
                    if media == 'not_image':  # exclude gif
                        pass
                    elif media is not None:
                        image = Image(
                            order=order,
                            picture=media,
                            publisher=blog,
                        )
                        # set width & height
                        w, h = otapick.get_image_w_h(image)
                        image.width = w
                        image.height = h
                        image_objects.append(image)
                        order += 1
                    else:
                        # download failed; dump whatever traceback is pending
                        import traceback
                        traceback.print_exc()
    # change the order_for_simul of already saved blog with the same post_date
    if Blog.objects.filter(post_date=post_date).exists():
        for saved_simultime_blog in Blog.objects.filter(post_date=post_date):
            saved_simultime_blog.order_for_simul += download_count
            saved_simultime_blog.save()
    # save new blog
    for blog_object in blog_objects:
        blog_object.save()
        if console:
            otapick.print_console(
                'register 「' + blog_object.title + '」 written by ' + blog_object.writer.full_kanji)
    # save new image
    for image_object in image_objects:
        image_object.save()
        otapick.compress_blog_image(image_object)
    # tweet update info
    if tweet:
        updateBot = otapick.UpdateBot()
        for blog_object in blog_objects:
            updateBot.tweet(
                group_id=blog_object.publishing_group.group_id, blog_ct=blog_object.blog_ct)
    # When there is at least one already saved blog in blog_list and all_check is False
    if download_count != len(blog_info_list) and not all_check:
        return True
    # When all blog in blog_list are new or when all_check is True
    else:
        return False
def register_blogs(group_id, up_limit=100, all_check=False, unregister_num=1, tweet=False):
    """
    Register blogs for one group, newest first.

    Flow: walk the official blog list page by page; stop when an
    already-saved blog is hit or the last page is reached. Blogs posted at
    the same instant are accumulated in ``simultime_blogs`` and flushed
    (registered via exe_registration) as soon as a blog with a different
    post_date appears. Their list index becomes order_for_simul, and any
    saved blogs sharing that post_date get their order_for_simul shifted by
    the batch size (see exe_registration).

    Args:
        group_id (int): group ID
        up_limit (int): maximum number of pages (default: 100)
        all_check (bool): keep processing even when an already-saved blog is
            found (default: False)
        unregister_num (int): how many pages to run unregistration checks on
            (default: 1)
        tweet (bool): whether to tweet update notifications
    Returns:
        True (success), None (failed)
    """
    sleep_time_pagetransition = 3
    simultime_blogs = []
    simultime_post_date = ""
    correct_cts_list = []  # [[234, 3422, ...], [214, 423, ...]]
    blog_crawler = otapick.BlogListCrawler()
    groups = Group.objects.filter(group_id=group_id)
    group = groups.first()
    group_key = group.key
    # otapick.lastone presumably yields (item, is_last) pairs — TODO confirm.
    for page, is_last in otapick.lastone(range(up_limit)):
        blogs_data = blog_crawler.crawl(group_key, page)
        if blogs_data is None:
            # crawl failed -> abort whole run (returns None)
            return
        elif len(blogs_data) == 0:
            # ran past the final page: flush the pending batch and stop
            otapick.print_console("register unacquired blog...")
            exe_registration(simultime_blogs, simultime_post_date,
                             group_id, all_check, tweet, console=True)
            otapick.print_console("finished!!")
            break
        # remember the cts of the first `unregister_num` pages for the
        # later unregistration cross-check
        if len(correct_cts_list) < unregister_num:
            correct_cts = [blog_info['blog_ct'] for blog_info in blogs_data]
            correct_cts_list.append(correct_cts)
        for blog_info in blogs_data:
            # first time
            if not simultime_blogs and not simultime_post_date:
                simultime_blogs.append(blog_info)
                simultime_post_date = blog_info['post_date']
            # When the post_date is same time as previous one,
            elif simultime_post_date == blog_info['post_date']:
                simultime_blogs.append(blog_info)
            # When the post_date isn't same time as previous one,
            else:
                finished = exe_registration(
                    simultime_blogs, simultime_post_date, group_id, all_check, tweet, console=True)
                if finished:
                    # hit an already-saved blog: run the unregistration
                    # check, then break to terminate the outer loop too
                    # (the for/else below is skipped by this break)
                    unregister(correct_cts_list, group, unregister_num)
                    break
                # start a new same-time batch with the current blog
                simultime_blogs = [blog_info]
                simultime_post_date = blog_info['post_date']
        else:
            # inner loop completed without break -> go to the next page
            if is_last:
                exe_registration(simultime_blogs, simultime_post_date,
                                 group_id, all_check, tweet, console=True)
                break
            time.sleep(sleep_time_pagetransition)
            otapick.print_console('go next page.')
            continue
        # reached only via the inner `break`: stop paging entirely
        break
    return True
def get_member_image_tag_ex(soup):
    """Return the <img> tag inside a page's infobox image wrapper.

    Used for graduated members whose profile image comes from an external
    page (infobox layout). Returns None (after logging) when either the
    wrapper or the image is missing.
    """
    wrapper = soup.find('td', class_='infobox-image-wrapper')
    # Bug fix: the original called wrapper.find('img') without checking
    # for None first, crashing with AttributeError on pages that lack the
    # infobox wrapper entirely.
    if wrapper is None:
        otapick.print_console('image_tag_wrapper not found')
        return
    image_tag = wrapper.find('img')
    if image_tag is None:
        otapick.print_console('image_tag not found')
    return image_tag