def get_custom_comment(tweet: dict) -> dict:
    """Build a comment record from a tweet that refers to another tweet.

    The tweet is classified, in this order of precedence, as a retweet
    (it has a ``retweeted_status`` key), a quote (``is_quote_status`` is
    truthy) or a reply (``in_reply_to_status_id`` is truthy).

    :param tweet: tweet payload as a dict.
    :return: the record produced by ``extract_fields`` with ``post_id`` and
        ``comment_id`` prefixed by ``'twi_'``, or ``None`` when the tweet is
        none of the three kinds above.
    """
    # Pick the per-kind parameters first; a metadata path of None means the
    # record gets an empty-string metadata instead of an extracted one.
    if 'retweeted_status' in tweet:
        kind = 'retweet'
        post_id_path = 'retweeted_status:id_str'
        metadata_path = 'retweeted_status'
    elif tweet.get('is_quote_status', False):
        kind = 'quote'
        post_id_path = 'quoted_status_id_str'
        metadata_path = 'quoted_status'
    elif tweet.get('in_reply_to_status_id', False):
        kind = 'reply'
        post_id_path = 'in_reply_to_status_id_str'
        metadata_path = None
    else:
        # A plain tweet is not a comment on anything.
        return None

    extract_spec = {
        'comment_text': ('text', str),
        'comment_date': ('created_at', str),
        'comment_id': ('id_str', str),
        'post_id': (post_id_path, str),
    }
    seed = {'type': kind}
    if metadata_path is None:
        seed['metadata'] = ''
    else:
        extract_spec['metadata'] = (metadata_path, str)

    record = extract_fields(tweet, extract_spec, seed)
    # Prefix both ids with the platform tag.
    record['post_id'] = 'twi_' + record['post_id']
    record['comment_id'] = 'twi_' + record['comment_id']
    return record
def get_custom_media(tweet: dict) -> dict:
    """Build one media record per entry under ``extended_entities:media``.

    :param tweet: tweet payload as a dict; ``tweet['id_str']`` is read
        unconditionally, before the empty-media check.
    :return: a list of media records, or ``None`` when the tweet has no
        media entries.
    """
    media_entries = extract_field(tweet, 'extended_entities:media', list)
    post_id = 'twi_' + tweet['id_str']
    if len(media_entries) == 0:
        return None

    def _to_record(media):
        # A fresh spec is built for every entry, as the original loop did.
        record = extract_fields(media, {
            'file_url': ('media_url', str),
            'video_url': ('video_info:variants', str),
        })
        record['post_id'] = post_id

        # Keep only the URL of the last listed variant; '' when there are none.
        variants = record['video_url']
        record['video_url'] = variants[-1]['url'] if variants else ''

        # file_path is the URL's final path segment with a '/' inserted
        # after its first four characters.
        file_name = parse.urlsplit(record['file_url']).path.split(r'/')[-1]
        record['file_path'] = r'/'.join([file_name[:4], file_name[4:]])
        record['metadata'] = media
        return record

    return [_to_record(media) for media in media_entries]
def get_custom_media(post):
    """Build one media record per media node of an Instagram post.

    Records are produced for every entry under
    ``edge_sidecar_to_children:edges`` of the ``graphql:shortcode_media``
    node, followed by one record for that node itself.

    :param post: post payload containing ``graphql:shortcode_media``.
    :return: list of media records; each has ``video_url`` (via
        ``extract_fields``), ``file_url``, ``post_id`` (prefixed with
        ``'insta_'``), ``file_path`` and ``metadata`` (the media node).
    :raises KeyError, IndexError: if a media node lacks a non-empty
        ``display_resources`` list with a ``src`` entry.
    """
    node = extract_field(post, 'graphql:shortcode_media', dict)
    post_id = extract_field(node, 'id', str)
    children = extract_field(node, 'edge_sidecar_to_children:edges', list)

    # The top-level node always carries at least one image, so it is
    # processed too. Build a new list instead of appending in place: the
    # list returned by extract_field may be the one stored inside ``post``
    # (TODO confirm), and appending the node to its own edges would mutate
    # the caller's data and make the node self-referential.
    list_media = list(children) + [{'node': node}]

    extract_spec = {
        'video_url': ('video_url', str),
    }
    rets = []
    for entry in list_media:
        media = entry['node']
        custom_media = extract_fields(media, extract_spec)
        # The last display resource is used as the file URL.
        custom_media['file_url'] = media['display_resources'][-1]['src']
        custom_media['post_id'] = 'insta_' + post_id
        # file_path is the URL's final path segment with a '/' inserted
        # after its first four characters.
        file_name = parse.urlsplit(custom_media['file_url']).path.split('/')[-1]
        custom_media['file_path'] = '/'.join([file_name[:4], file_name[4:]])
        custom_media['metadata'] = media
        rets.append(custom_media)
    return rets
def get_custom_post(post: dict) -> dict:
    """Build a post record from an Instagram post payload.

    :param post: post payload containing ``graphql:shortcode_media``.
    :return: the record produced by ``extract_fields`` with the caption
        joined into a single string under ``post``, plus ``hash_tag``,
        ``url``, ``post_id`` (prefixed with ``'insta_'``) and ``metadata``
        (the ``shortcode_media`` node).
    """
    node = extract_field(post, 'graphql:shortcode_media', dict)
    extract_spec = {
        'post_id': ('id', str),
        'post': ('edge_media_to_caption:edges', dict),
        'like_count': ('edge_media_preview_like:count', int),
        'post_date': ('taken_at_timestamp', int),
        'comment_count': ('edge_media_to_comment:count', int),
    }
    custom_post = extract_fields(node, extract_spec)

    # Join the caption edges into one text.
    caption = custom_post['post']
    if isinstance(caption, list):
        caption = '\n---\n'.join(x['node']['text'] for x in caption)
    elif not isinstance(caption, str):
        # No usable caption (e.g. the edges were missing): fall back to an
        # empty text instead of failing on the string operations below.
        caption = ''
    custom_post['post'] = caption

    # Extract hashtags: everything from the first '#' onwards, newlines
    # removed. str.find returns -1 when there is no '#', and slicing from
    # -1 would wrongly yield the caption's last character, so that case is
    # handled explicitly.
    tag_start = caption.find('#')
    if tag_start < 0:
        custom_post['hash_tag'] = ''
    else:
        custom_post['hash_tag'] = caption[tag_start:].replace('\n', '')

    custom_post['url'] = "https://www.instagram.com/p/%s/" % extract_field(
        node, 'shortcode', str)
    custom_post['post_id'] = 'insta_' + custom_post['post_id']
    custom_post['metadata'] = node
    return custom_post
def get_custom_post(tweet: dict) -> dict:
    """Build a post record from a tweet payload.

    :param tweet: tweet payload as a dict; ``tweet['user']['screen_name']``
        and ``tweet['id_str']`` are read directly to build the URL.
    :return: the record produced by ``extract_fields`` with ``post_id``
        prefixed by ``'twi_'``, plus ``url``, a boolean ``extended`` flag
        and ``metadata`` (the tweet itself).
    """
    # NOTE(review): like_count is declared with str while comment_count
    # uses int -- confirm whether that difference is intended.
    field_spec = {
        'post_id': ('id_str', str),
        'post': ('text', str),
        'hash_tag': ('entities:hashtags', str),
        'like_count': ('favorite_count', str),
        'post_date': ('created_at', str),
        'comment_count': ('retweet_count', int),
        'extended': ('extended_tweet', bool),
    }
    record = extract_fields(tweet, field_spec)

    record['post_id'] = 'twi_' + record['post_id']

    url_template = 'https://twitter.com/{user_screenid}/status/{tweet_id}'
    record['url'] = url_template.format(
        user_screenid=tweet['user']['screen_name'],
        tweet_id=tweet['id_str'],
    )

    # Collapse whatever was extracted for 'extended_tweet' to a plain bool.
    record['extended'] = bool(record['extended'])
    record['metadata'] = tweet
    return record
def get_custom_comment(post):
    """Build one comment record per entry under ``edge_media_to_comment:edges``.

    :param post: post payload containing ``graphql:shortcode_media``.
    :return: list of comment records, each with ``post_id`` (prefixed with
        ``'insta_'``), ``comment_text``, ``comment_date`` and ``metadata``
        (the comment node); ``None`` when the post has no comments.
    """
    node = extract_field(post, 'graphql:shortcode_media', dict)
    list_comments = extract_field(node, 'edge_media_to_comment:edges', list)
    if not list_comments:
        return None

    # The text and date sources were previously swapped (comment_text read
    # 'created_at' and comment_date read 'text'); each field now reads its
    # matching source key.
    # NOTE(review): post_id is read from the comment node's own 'id' --
    # confirm this is the intended identifier.
    extract_spec = {
        'post_id': ('id', str),
        'comment_text': ('text', str),
        'comment_date': ('created_at', int),
    }

    rets = []
    for edge in list_comments:
        comment = edge['node']
        custom_comment = extract_fields(comment, extract_spec)
        custom_comment['post_id'] = 'insta_' + custom_comment['post_id']
        custom_comment['metadata'] = comment
        rets.append(custom_comment)
    return rets
meta = extract_field(node, 'metadata', dict) post_id = extract_field(meta, 'id', str) list_media = extract_field(meta, 'edge_sidecar_to_children:edges', list) # 무조건 이미지 하나는 포함됨 list_media.append({'node': meta}) rets = [] extract_spec = { 'video_url': ('video_url', str), } _dup_check = [] for media in (x['node'] for x in list_media): custom_media = extract_fields(media, extract_spec) custom_media['file_url'] = media['display_resources'][-1]['src'] if custom_media['file_url'] in _dup_check: continue custom_media['post_id'] = 'insta_' + post_id custom_media['file_path'] = parse.urlsplit(custom_media['file_url']).path.split(r'/')[-1] custom_media['file_path'] = (lambda s: r'/'.join([s[:4], s[4:]]))(custom_media['file_path']) custom_media['metadata'] = media rets.append(custom_media) json_media.extend(rets) # print(json_media) # print(file_path.split('_')) with open(file_path, 'w', encoding = 'utf-8') as fp: