def _detect_story_images(ctx, story):
    """Detect the status of a story's images and dispatch detection for unknown ones."""
    image_processor = StoryImageProcessor(story.link, story.content)
    image_urls = _image_urls_of_indexs(image_processor.parse())
    if not image_urls:
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    # group the not-yet-detected image URLs by their URL root
    num_todo_image_urls = 0
    todo_url_roots = defaultdict(list)
    for url in image_urls:
        status = image_statuses.get(url)
        if status is None:
            num_todo_image_urls += 1
            url_root = ImageInfo.extract_url_root(url)
            todo_url_roots[url_root].append(url)
    LOG.info(
        f'story#{story.feed_id},{story.offset} {story.link} has {len(image_urls)} images, '
        f'need detect {num_todo_image_urls} images '
        f'from {len(todo_url_roots)} url_roots'
    )
    if todo_url_roots:
        # sample at most 3 URLs per URL root to limit detection requests
        todo_urls = []
        for items in todo_url_roots.values():
            if len(items) > 3:
                todo_urls.extend(random.sample(items, 3))
            else:
                todo_urls.extend(items)
        ctx.hope('worker_rss.detect_story_images', dict(
            feed_id=story.feed_id,
            offset=story.offset,
            story_url=story.link,
            image_urls=list(set(todo_urls)),
        ))
    else:
        # all image statuses are already known; replace referer-deny images now
        _replace_story_images(feed_id=story.feed_id, offset=story.offset)
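# The helper `_image_urls_of_indexs` is used above but not defined in this
# section. A minimal sketch of what it could look like, assuming each parsed
# image index exposes its image URL on a `.value` attribute (that attribute
# name is an assumption, not confirmed by the surrounding code):
def _image_urls_of_indexs(image_indexs):
    # Collect the distinct image URLs from the parsed image indexes.
    image_urls = set()
    for image_index in image_indexs:
        image_urls.add(image_index.value)
    return list(image_urls)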
def _replace_story_images(feed_id, offset):
    """Replace referer-deny images with image proxy URLs, loading the story via STORY_SERVICE."""
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    image_replaces = {}
    for url, status in image_statuses.items():
        if status in IMAGE_REFERER_DENY_STATUS:
            new_url_data = encode_image_url(url, story.link)
            image_replaces[url] = '/api/v1/image/{}?{}'.format(new_url_data, RSSANT_IMAGE_TAG)
    LOG.info(f'story#{feed_id},{offset} {story.link} '
             f'replace {len(image_replaces)} referer deny images')
    # image_processor.process will (1) fix relative URLs and (2) replace image URLs,
    # so call it regardless of whether image_replaces is empty.
    content = image_processor.process(image_indexs, image_replaces)
    STORY_SERVICE.update_story(feed_id, offset, {'content': content})
def _replace_story_images(story_id):
    """Replace referer-deny images with image proxy URLs, loading the story via the Django ORM."""
    story = Story.objects.get(pk=story_id)
    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    image_replaces = {}
    for url, status in image_statuses.items():
        if status in IMAGE_REFERER_DENY_STATUS:
            new_url_data = encode_image_url(url, story.link)
            image_replaces[url] = '/api/v1/image/{}?{}'.format(new_url_data, RSSANT_IMAGE_TAG)
    LOG.info(f'story#{story_id} {story.link} '
             f'replace {len(image_replaces)} referer deny images')
    # image_processor.process will (1) fix relative URLs and (2) replace image URLs,
    # so call it regardless of whether image_replaces is empty.
    content = image_processor.process(image_indexs, image_replaces)
    story.content = content
    story.save()
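# Illustration of the intended effect of image_processor.process above, with
# hypothetical input/output (the exact markup handling of StoryImageProcessor
# is assumed here, not confirmed by this section):
#
#   relative URL fixed against story.link:
#     before: <img src="/images/a.png">
#     after:  <img src="https://blog.example.com/images/a.png">
#
#   referer-deny image routed through the image proxy:
#     before: <img src="https://cdn.example.com/b.png">
#     after:  <img src="/api/v1/image/<encoded-url-data>?<RSSANT_IMAGE_TAG>">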