def parse_url(cls, url):
    """Turn a URL (or a bare id) into the value used to look the target up.

    Returns:
        * ``url`` itself when it consists only of digits;
        * the ``post_id`` group when ``POST_REGEXP`` matches;
        * a ``hoordu.Dynamic`` holding the ``vanity`` group when
          ``CREATOR_REGEXP`` matches;
        * ``None`` when nothing matches.
    """
    # an all-digit string is handed back untouched
    if url.isdigit():
        return url

    post_match = POST_REGEXP.match(url)
    if post_match:
        return post_match.group('post_id')

    creator_match = CREATOR_REGEXP.match(url)
    if not creator_match:
        return None

    return hoordu.Dynamic({'vanity': creator_match.group('vanity')})
def parse_url(cls, url):
    """Turn a URL (or a bare id) into the value used to look the target up.

    Returns:
        * ``url`` itself when it consists only of digits;
        * the ``post_id`` group when ``POST_REGEXP`` matches;
        * a ``hoordu.Dynamic`` whose ``creator_id`` is the ``fanclub_id``
          group when ``FANCLUB_REGEXP`` matches;
        * ``None`` when nothing matches.
    """
    # an all-digit string is handed back untouched
    if url.isdigit():
        return url

    post_match = POST_REGEXP.match(url)
    if post_match:
        return post_match.group('post_id')

    fanclub_match = FANCLUB_REGEXP.match(url)
    if not fanclub_match:
        return None

    return hoordu.Dynamic({'creator_id': fanclub_match.group('fanclub_id')})
def parse_url(cls, url):
    """Turn a URL (or a bare id) into the value used to look the target up.

    ``POST_REGEXP`` and ``CREATOR_REGEXP`` are iterated here, so each is a
    collection of patterns; they are tried in order and the first match wins.

    Returns:
        * ``url`` itself when it consists only of digits;
        * the ``post_id`` group of the first matching post pattern;
        * a ``hoordu.Dynamic`` holding the ``creator`` group of the first
          matching creator pattern;
        * ``None`` when nothing matches.
    """
    # an all-digit string is handed back untouched
    if url.isdigit():
        return url

    # the generators are lazy: patterns after the first hit are never tried
    post_hit = next(
        (hit for hit in (rx.match(url) for rx in POST_REGEXP) if hit),
        None,
    )
    if post_hit is not None:
        return post_hit.group('post_id')

    creator_hit = next(
        (hit for hit in (rx.match(url) for rx in CREATOR_REGEXP) if hit),
        None,
    )
    if creator_hit is not None:
        return hoordu.Dynamic({'creator': creator_hit.group('creator')})

    return None
def parse_url(cls, url):
    """Turn a URL (or a bare id) into the value used to look the target up.

    Returns:
        * ``url`` itself when it consists only of digits;
        * the ``tweet_id`` group of the first pattern in ``TWEET_REGEXP``
          that matches;
        * a ``hoordu.Dynamic`` with ``user`` and ``method`` when
          ``TIMELINE_REGEXP`` matches, where ``method`` is ``'likes'`` if the
          ``type`` group is ``'likes'`` and ``'tweets'`` for anything else;
        * ``None`` when nothing matches.
    """
    # an all-digit string is handed back untouched
    if url.isdigit():
        return url

    # the generators are lazy: patterns after the first hit are never tried
    tweet_hit = next(
        (hit for hit in (rx.match(url) for rx in TWEET_REGEXP) if hit),
        None,
    )
    if tweet_hit is not None:
        return tweet_hit.group('tweet_id')

    timeline_hit = TIMELINE_REGEXP.match(url)
    if not timeline_hit:
        return None

    user = timeline_hit.group('user')
    # every timeline kind other than 'likes' collapses to 'tweets'
    method = 'likes' if timeline_hit.group('type') == 'likes' else 'tweets'

    return hoordu.Dynamic({
        'user': user,
        'method': method
    })
def _to_remote_post(self, post, remote_post=None, preview=False):
    """Create or refresh the local ``RemotePost`` for a fetched post.

    Args:
        post: the post object as returned by the remote API (attribute
            access is used throughout: ``id``, ``user``, ``body``, ...).
        remote_post: an already loaded ``RemotePost`` to update; when
            ``None`` it is looked up with ``self._get_post`` and created if
            the lookup also returns ``None``.
        preview: when true, original files are not downloaded; thumbnails
            still are.

    Returns:
        The created or updated ``RemotePost``.

    Raises:
        NotImplementedError: for an unknown ``post.type`` or an unknown
            embed service provider inside an article.
    """
    main_id = post.id
    creator_id = post.user.userId
    creator_slug = post.creatorId
    creator_name = post.user.name
    # possible timezone issues?
    post_time = dateutil.parser.parse(post.publishedDatetime).astimezone(
        timezone.utc)

    if remote_post is None:
        remote_post = self._get_post(main_id)

    if remote_post is None:
        metadata = hoordu.Dynamic()
        if post.feeRequired != 0:
            metadata.price = post.feeRequired

        remote_post = RemotePost(
            source=self.source,
            original_id=main_id,
            url=POST_FORMAT.format(creator=creator_slug, post_id=main_id),
            title=post.title,
            type=PostType.collection,
            post_time=post_time,
            metadata_=metadata.to_json())

        self.session.add(remote_post)
        self.session.flush()

    self.log.info(f'downloading post: {remote_post.original_id}')
    self.log.info(f'local id: {remote_post.id}')

    if post.isLiked is True:
        remote_post.favorite = True

    # creators are identified by their pixiv id because their name and
    # creatorId can change
    creator_tag = self._get_tag(TagCategory.artist, creator_id)
    remote_post.add_tag(creator_tag)

    # a tuple (not a generator) so that both metadata updates always run
    if any((creator_tag.update_metadata('name', creator_name),
            creator_tag.update_metadata('slug', creator_slug))):
        self.session.add(creator_tag)

    for tag in post.tags:
        remote_tag = self._get_tag(TagCategory.general, tag)
        remote_post.add_tag(remote_tag)

    if post.hasAdultContent is True:
        nsfw_tag = self._get_tag(TagCategory.meta, 'nsfw')
        remote_post.add_tag(nsfw_tag)

    # files already stored for this post, keyed by their metadata string
    current_files = {file.metadata_: file for file in remote_post.files}
    # a set that is kept up to date below, so a URL that shows up twice in
    # the same article is only added once
    known_urls = {related.url for related in remote_post.related}

    if post.type == 'image':
        for order, image in enumerate(post.body.images, 1):
            file_key = 'i-{}'.format(image.id)
            file = current_files.get(file_key)

            if file is None:
                file = File(remote=remote_post, remote_order=order,
                            metadata_=file_key)
                self.session.add(file)
                self.session.flush()
            else:
                file.remote_order = order
                self.session.add(file)

            need_orig = not file.present and not preview
            need_thumb = not file.thumb_present

            if need_thumb or need_orig:
                self.log.info(f'downloading file: {file.remote_order}')

                orig = self._download_file(
                    image.originalUrl) if need_orig else None
                thumb = self._download_file(
                    image.thumbnailUrl) if need_thumb else None

                self.session.import_file(file, orig=orig, thumb=thumb,
                                         move=True)

        remote_post.comment = post.body.text
        self.session.add(remote_post)

    elif post.type == 'file':
        for order, rfile in enumerate(post.body.files, 1):
            file_key = 'f-{}'.format(rfile.id)
            file = current_files.get(file_key)

            if file is None:
                filename = '{0.name}.{0.extension}'.format(rfile)
                file = File(remote=remote_post, remote_order=order,
                            filename=filename, metadata_=file_key)
                self.session.add(file)
                self.session.flush()
            else:
                file.remote_order = order
                self.session.add(file)

            need_orig = not file.present and not preview

            if need_orig:
                self.log.info(f'downloading file: {file.remote_order}')

                orig = self._download_file(rfile.url)

                self.session.import_file(file, orig=orig, move=True)

        remote_post.comment = post.body.text
        self.session.add(remote_post)

    elif post.type == 'article':
        imagemap = post.body.get('imageMap')
        filemap = post.body.get('fileMap')
        embedmap = post.body.get('embedMap')

        # position of the next image/file block within the article
        order = 1

        # ordered text/file entries that become the post comment
        blog = []
        for block in post.body.blocks:
            if block.type in ('p', 'header'):
                links = block.get('links')
                if links is not None:
                    for link in links:
                        url = link.url
                        if url not in known_urls:
                            remote_post.add_related_url(url)
                            known_urls.add(url)

                blog.append({'type': 'text', 'content': block.text + '\n'})

            elif block.type == 'image':
                file_key = 'i-{}'.format(block.imageId)
                file = current_files.get(file_key)

                if file is None:
                    file = File(remote=remote_post, remote_order=order,
                                metadata_=file_key)
                    self.session.add(file)
                    self.session.flush()
                else:
                    file.remote_order = order
                    self.session.add(file)

                orig_url = imagemap[block.imageId].originalUrl
                thumb_url = imagemap[block.imageId].thumbnailUrl

                need_orig = not file.present and not preview
                need_thumb = not file.thumb_present

                if need_thumb or need_orig:
                    self.log.info(f'downloading file: {file.remote_order}')

                    orig = self._download_file(
                        orig_url) if need_orig else None
                    thumb = self._download_file(
                        thumb_url) if need_thumb else None

                    self.session.import_file(file, orig=orig, thumb=thumb,
                                             move=True)

                blog.append({'type': 'file', 'metadata': file_key})

                order += 1

            elif block.type == 'file':
                file_key = 'f-{}'.format(block.fileId)
                file = current_files.get(file_key)

                if file is None:
                    file = File(remote=remote_post, remote_order=order,
                                metadata_=file_key)
                    self.session.add(file)
                    self.session.flush()
                else:
                    # keep the stored order in sync with the article, the
                    # same way every other file branch does
                    file.remote_order = order
                    self.session.add(file)

                orig_url = filemap[block.fileId].url
                # attachments use the post cover image as their thumbnail
                thumb_url = post.coverImageUrl

                need_orig = not file.present and not preview
                need_thumb = not file.thumb_present and thumb_url is not None

                if need_thumb or need_orig:
                    self.log.info(f'downloading file: {file.remote_order}')

                    orig = self._download_file(
                        orig_url) if need_orig else None
                    thumb = self._download_file(
                        thumb_url) if need_thumb else None

                    self.session.import_file(file, orig=orig, thumb=thumb,
                                             move=True)

                blog.append({'type': 'file', 'metadata': file_key})

                order += 1

            elif block.type == 'embed':
                embed = embedmap[block.embedId]

                if embed.serviceProvider == 'fanbox':
                    related_post_id = embed.contentId.split('/')[-1]
                    # NOTE(review): POST_FORMAT is given a `creator` field
                    # when the post url is built above but not here; if the
                    # template contains `{creator}` this raises KeyError.
                    # The embedded post's creator is not known at this
                    # point, so the call is left as it was - confirm.
                    url = POST_FORMAT.format(post_id=related_post_id)

                elif embed.serviceProvider == 'google_forms':
                    url = 'https://docs.google.com/forms/d/e/{}/viewform'.format(
                        embed.contentId)

                elif embed.serviceProvider == 'twitter':
                    url = 'https://twitter.com/i/web/status/{}'.format(
                        embed.contentId)

                else:
                    raise NotImplementedError(
                        'unknown embed service provider: {}'.format(
                            embed.serviceProvider))

                if url not in known_urls:
                    remote_post.add_related_url(url)
                    known_urls.add(url)

                blog.append({'type': 'text', 'content': url + '\n'})

            else:
                self.log.warning('unknown blog block: %s', str(block.type))

        remote_post.comment = hoordu.Dynamic({'comment': blog}).to_json()
        remote_post.type = PostType.blog
        self.session.add(remote_post)

    elif post.type == 'text':
        remote_post.comment = post.body.text
        remote_post.type = PostType.set
        self.session.add(remote_post)

    else:
        raise NotImplementedError('unknown post type: {}'.format(
            post.type))

    return remote_post
def tweet_to_remote_post(self, tweet, remote_post=None, preview=False):
    """Create or refresh the local ``RemotePost`` for a tweet.

    Args:
        tweet: the tweet object from the API; when it is a retweet the
            retweeted status is stored instead.
        remote_post: an already loaded ``RemotePost`` to update; when
            ``None`` it is looked up with ``self._get_post`` and created if
            the lookup also returns ``None``.
        preview: when true, original media is not downloaded; thumbnails
            still are.

    Returns:
        The created or updated ``RemotePost``.

    Raises:
        APIError: when one of the tweet's urls matches
            ``SUPPORT_URL_REGEXP``.
    """
    # get the original tweet if this is a retweet
    retweeted = tweet.retweeted_status
    if retweeted is not None:
        tweet = retweeted

    tweet_id = tweet.id_str
    screen_name = tweet.user.screen_name
    author_id = tweet.user.id_str
    text = tweet.full_text
    post_time = datetime.utcfromtimestamp(tweet.created_at_in_seconds)

    if remote_post is None:
        remote_post = self._get_post(tweet_id)

    if remote_post is None:
        post_metadata = hoordu.Dynamic({'user': screen_name}).to_json()
        remote_post = RemotePost(
            source=self.source,
            original_id=tweet_id,
            url=TWEET_FORMAT.format(user=screen_name, tweet_id=tweet_id),
            comment=text,
            type=PostType.set,
            post_time=post_time,
            metadata_=post_metadata
        )

        self.session.add(remote_post)
        self.session.flush()

    self.log.info(f'downloading post: {remote_post.original_id}')
    self.log.info(f'local id: {remote_post.id}')

    remote_post.comment = text
    remote_post.favorite = tweet.favorited is True

    # the artist tag is keyed by the user id; the screen name is metadata
    author_tag = self._get_tag(TagCategory.artist, author_id)
    remote_post.add_tag(author_tag)
    if author_tag.update_metadata('user', screen_name):
        self.session.add(author_tag)

    if tweet.possibly_sensitive:
        remote_post.add_tag(self._get_tag(TagCategory.meta, 'nsfw'))

    hashtags = tweet.hashtags
    if hashtags is not None:
        for hashtag in hashtags:
            remote_post.add_tag(
                self._get_tag(TagCategory.general, hashtag.text))

    if tweet.in_reply_to_status_id is not None:
        parent_url = TWEET_FORMAT.format(
            user=tweet.in_reply_to_screen_name,
            tweet_id=tweet.in_reply_to_status_id)
        remote_post.add_related_url(parent_url)

    links = tweet.urls
    if links is not None:
        for link in links:
            if SUPPORT_URL_REGEXP.match(link.url):
                raise APIError(text)

            # the unwound section is a premium feature
            self.log.info(f'unwinding: {link.url}')
            remote_post.add_related_url(self._unwind_url(link.url))

    self.session.add(remote_post)

    media_list = tweet.media
    if media_list is not None:
        # make sure there is one File per media entry, by position
        wanted_orders = set(range(len(media_list)))
        stored_orders = {stored.remote_order for stored in remote_post.files}

        for missing_order in wanted_orders - stored_orders:
            new_file = File(remote=remote_post, remote_order=missing_order)
            self.session.add(new_file)
            self.session.flush()

        for file in remote_post.files:
            need_thumb = not file.thumb_present
            need_file = not file.present and not preview

            if not (need_thumb or need_file):
                continue

            self.log.info(f'downloading file: {file.remote_order}')

            media = media_list[file.remote_order]

            base_url, ext = media.media_url_https.rsplit('.', 1)
            filename = '{}.{}'.format(base_url.rsplit('/', 1)[-1], ext)

            media_kind = media.type
            if media_kind == 'photo':
                thumb = self._download_media_file(
                    base_url, ext, THUMB_SIZE, filename) if need_thumb else None
                orig = self._download_media_file(
                    base_url, ext, ORIG_SIZE, filename) if need_file else None

                self.session.import_file(file, orig=orig, thumb=thumb,
                                         move=True)

                file.ext = ext
                file.thumb_ext = ext
                self.session.add(file)

            elif media_kind in ('video', 'animated_gif'):
                thumb = self._download_media_file(
                    base_url, ext, THUMB_SIZE, filename) if need_thumb else None
                orig = self._download_video(media) if need_file else None

                self.session.import_file(file, orig=orig, thumb=thumb,
                                         move=True)

                # only the thumbnail extension is taken from media_url_https
                file.thumb_ext = ext
                self.session.add(file)

    return remote_post
def parse_args(hrd):
    """Parse ``sys.argv`` into a ``hoordu.Dynamic`` of command line options.

    Recognised global options are ``-h/--help``, ``-p/--plugin <id>`` and
    ``-d/--disabled``. The first word that names a sub-command (``setup``,
    ``list``, ``enable``, ``disable``, ``update``, ``fetch``, ``rfetch``,
    ``related``) selects it; anything else seen before a sub-command is
    treated as a url and passed to ``parse_url``.

    Args:
        hrd: forwarded unchanged to ``parse_url``.

    Returns:
        A ``hoordu.Dynamic`` with ``plugin_id``, ``command``, ``urls``,
        ``subscription``, ``num_posts`` and ``disabled``.

    Invalid combinations are reported through ``fail``.
    """
    # parse arguments
    args = hoordu.Dynamic()
    args.plugin_id = None
    args.command = None
    args.urls = []
    args.subscription = None
    args.num_posts = None
    args.disabled = False

    argi = 1
    sargi = 0  # sub argument count
    while argi < argc:
        arg = sys.argv[argi]
        argi += 1

        # global commands
        if arg in ('-h', '--help'):
            usage()
            sys.exit(0)

        elif arg in ('-p', '--plugin'):
            # the option needs a value; report it instead of an IndexError
            if argi >= len(sys.argv):
                fail(f'{arg} requires a plugin id')
            args.plugin_id = sys.argv[argi]
            argi += 1

        elif arg in ('-d', '--disabled'):
            args.disabled = True

        elif args.command is None:
            # pick command, or append to list of urls
            if arg in ('setup', 'list', 'enable', 'disable', 'update',
                       'fetch', 'rfetch', 'related'):
                args.command = arg
                sargi = 0
            else:
                args.urls.append(parse_url(hrd, arg, args.plugin_id))

        else:
            # sub-command arguments
            if args.command in ('enable', 'disable', 'update') and sargi < 1:
                parse_sub_name(arg, args)
                sargi += 1

            elif args.command in ('fetch', 'rfetch') and sargi < 2:
                if sargi == 0:
                    parse_sub_name(arg, args)
                else:
                    args.num_posts = int(arg)
                sargi += 1

            # `in ('related')` was a substring test against a plain string,
            # not tuple membership; compare for equality instead
            elif args.command == 'related' and sargi < 2:
                args.urls.append(parse_url(hrd, arg, args.plugin_id))
                sargi += 1

            else:
                fail(f'unknown argument: {arg}')

    # verify arguments
    urlc = len(args.urls)
    if urlc >= 2:
        # each entry is unpacked as a pair; the second element is a
        # hoordu.Dynamic for search urls
        for _, options in args.urls:
            if isinstance(options, hoordu.Dynamic):
                fail('can only process one search url at a time')

    if args.command == 'related' and urlc <= 1:
        fail('the related sub-command requires at least 2 urls')

    if args.command in ('list', 'setup') and args.plugin_id is None:
        fail(f'{args.command} sub-command requires a plugin to be specified')

    if (args.command in ('enable', 'disable', 'fetch', 'rfetch')
            and args.subscription is None):
        fail(
            f'{args.command} sub-command requires a subscription to be specified'
        )

    if (args.command == 'update' and args.subscription is None
            and args.plugin_id is None):
        fail(
            'update sub-command requires a plugin or a subscription to be specified'
        )

    return args
def _content_to_post(self, post, content, remote_post=None, preview=False):
    """Create or refresh the local ``RemotePost`` for one content of a post.

    The stored ``original_id`` is ``'<post.id>-<content.id>'``.

    Args:
        post: the post object from the API that owns ``content``.
        content: the content entry to import; ``content.category`` selects
            how it is handled (``file``, ``photo_gallery``, ``text`` or
            ``blog``).
        remote_post: an already loaded ``RemotePost`` to update; when
            ``None`` it is looked up with ``self._get_post`` and created if
            the lookup also returns ``None``.
        preview: when true, original files are not downloaded; thumbnails
            still are.

    Returns:
        The created or updated ``RemotePost``.

    Raises:
        NotImplementedError: for an unknown ``content.category``.
    """
    content_id = '{post_id}-{content_id}'.format(post_id=post.id,
                                                 content_id=content.id)
    creator_id = str(post.fanclub.id)
    creator_name = post.fanclub.user.name
    # possible timezone issues?
    post_time = dateutil.parser.parse(post.posted_at).astimezone(
        timezone.utc)

    if remote_post is None:
        remote_post = self._get_post(content_id)

    if remote_post is None:
        metadata = hoordu.Dynamic()
        if content.plan is not None:
            metadata.price = content.plan.price

        remote_post = RemotePost(source=self.source,
                                 original_id=content_id,
                                 url=POST_FORMAT.format(post_id=post.id),
                                 title=content.title,
                                 comment=content.comment,
                                 type=PostType.collection,
                                 post_time=post_time,
                                 metadata_=metadata.to_json())

        self.session.add(remote_post)
        self.session.flush()

    self.log.info(f'downloading post: {remote_post.original_id}')
    self.log.info(f'local id: {remote_post.id}')

    if post.liked is True:
        remote_post.favorite = True

    # creators are identified by their id because their name can change
    creator_tag = self._get_tag(TagCategory.artist, creator_id)
    remote_post.add_tag(creator_tag)

    if creator_tag.update_metadata('name', creator_name):
        self.session.add(creator_tag)

    for tag in post.tags:
        remote_tag = self._get_tag(TagCategory.general, tag.name)
        remote_post.add_tag(remote_tag)

    if post.rating == 'adult':
        nsfw_tag = self._get_tag(TagCategory.meta, 'nsfw')
        remote_post.add_tag(nsfw_tag)

    if content.category == 'file':
        # a file content holds exactly one file, stored at order 0
        if len(remote_post.files) == 0:
            file = File(remote=remote_post, remote_order=0,
                        filename=content.filename)
            self.session.add(file)
            self.session.flush()
        else:
            file = remote_post.files[0]

        need_orig = not file.present and not preview

        if need_orig:
            self.log.info(f'downloading file: {content.filename}')

            orig_url = FILE_DOWNLOAD_URL.format(
                download_uri=content.download_uri)
            orig = self._download_file(orig_url, filename=content.filename)

            self.session.import_file(file, orig=orig, move=True)

    elif content.category == 'photo_gallery':
        # files already stored for this post, keyed by the photo id string
        current_files = {
            file.metadata_: file
            for file in remote_post.files
        }

        for order, photo in enumerate(content.post_content_photos):
            photo_id = str(photo.id)
            file = current_files.get(photo_id)

            if file is None:
                file = File(remote=remote_post, metadata_=photo_id,
                            remote_order=order)
                self.session.add(file)
                self.session.flush()

            elif file.remote_order != order:
                file.remote_order = order
                self.session.add(file)

            need_orig = not file.present and not preview
            need_thumb = not file.thumb_present

            if need_thumb or need_orig:
                self.log.info(f'downloading file: {file.remote_order}')

                orig = self._download_file(
                    photo.url.original) if need_orig else None
                thumb = self._download_file(
                    photo.url.medium) if need_thumb else None

                self.session.import_file(file, orig=orig, thumb=thumb,
                                         move=True)

    elif content.category == 'text':
        # there are no files to save
        remote_post.type = PostType.set
        self.session.add(remote_post)

    elif content.category == 'blog':
        # Keyed by metadata_ (the photo id string) because that is what the
        # lookup below uses. Keying by remote_order meant the lookup never
        # matched and every re-import created duplicate File rows.
        current_files = {
            file.metadata_: file
            for file in remote_post.files
        }

        sections = hoordu.Dynamic.from_json(content.comment).ops

        # ordered text/file entries that become the post comment
        blog = []
        # position of the next image within the blog
        order = 0
        for section in sections:
            insert = section.insert
            if isinstance(insert, str):
                blog.append({'type': 'text', 'content': insert})

            elif isinstance(insert, hoordu.Dynamic):
                fantia_image = insert.get('fantiaImage')
                if fantia_image is not None:
                    photo_id = str(fantia_image.id)
                    file = current_files.get(photo_id)

                    if file is None:
                        file = File(remote=remote_post, metadata_=photo_id,
                                    remote_order=order)
                        self.session.add(file)
                        self.session.flush()

                    elif file.remote_order != order:
                        # same refresh as the photo_gallery branch
                        file.remote_order = order
                        self.session.add(file)

                    orig_url = FILE_DOWNLOAD_URL.format(
                        download_uri=fantia_image.original_url)
                    thumb_url = fantia_image.url

                    need_orig = not file.present and not preview
                    need_thumb = not file.thumb_present

                    if need_thumb or need_orig:
                        self.log.info(
                            f'downloading file: {file.remote_order}')

                        orig = self._download_file(
                            orig_url) if need_orig else None
                        thumb = self._download_file(
                            thumb_url) if need_thumb else None

                        self.session.import_file(file, orig=orig,
                                                 thumb=thumb, move=True)

                    blog.append({'type': 'file', 'metadata': photo_id})

                    order += 1

                else:
                    self.log.warning(f'unknown blog insert: {str(insert)}')

        remote_post.comment = hoordu.Dynamic({'comment': blog}).to_json()
        remote_post.type = PostType.blog
        self.session.add(remote_post)

    else:
        raise NotImplementedError('unknown content category: {}'.format(
            content.category))

    return remote_post