def run():
    """Poll the Instagram feeds of the configured users forever.

    Each cycle fetches the newest post for every user in ``user_dict``;
    when a post's timestamp is newer than the one stored in ``db``, the
    post is logged and (after the first warm-up pass) inserted as a
    message.  Runs until the process is killed.
    """
    web_api = Client(auto_patch=True, drop_incompat_keys=False)
    # First pass only records timestamps; nothing is inserted into the db.
    startup = True
    # Screen name -> Instagram numeric user id (string form).
    user_dict = {
        "SpaceX": "20311520",
        "jclishman.testing": "7400533474",
    }

    while True:
        # BUG FIX: `feed` used to be created once *outside* this loop, so
        # every polling cycle re-processed all previously fetched posts and
        # the list grew without bound.  Rebuild it fresh each cycle.
        feed = []
        for id_str in user_dict.values():
            try:
                feed.append(web_api.user_feed(id_str, count=1))
                time.sleep(5)
            except Exception as e:
                # Best-effort: log and back off instead of crashing the loop.
                # (Logging was previously commented out, silently swallowing
                # every error.)
                logger.error(str(e))
                logger.error("Error getting feed. Sleeping for 30s")
                time.sleep(30)

        for post in feed:
            post = post[0]["node"]
            user_id_str = post["owner"]["id"]
            shortcode = post["shortcode"]
            timestamp = post["created_time"]

            # Empty string if there isn't a caption ("caption" missing or None).
            try:
                caption = post["caption"]["text"]
            except (KeyError, TypeError):
                caption = ''

            # Match ID number to screen name.
            for screen_name, id_str in user_dict.items():
                if user_id_str == id_str:
                    user_screen_name = screen_name

            stored_timestamp = db.get_instagram_timestamp(user_screen_name)

            if int(timestamp) > stored_timestamp:
                start_time = time.time()
                db.update_instagram_timestamp(user_screen_name, int(timestamp))

                logger.info(f"New Instagram post by @{user_screen_name}, id {user_id_str}")
                logger.info(f"Post shortcode: {shortcode}")
                logger.info(f"Post caption: {caption}")
                logger.info(f"Post timestamp: {timestamp}")

                url = f"https://instagram.com/p/{shortcode}"
                if not startup:
                    db.insert_message('Instagram', user_screen_name,
                                      caption.replace("\n", " "), url, start_time)

        time.sleep(10)
        startup = False
def instagram_feed(user_handle="google", user_id=1067259270):
    """Return the latest posts of an Instagram user as a JSON response.

    Args:
        user_handle: screen name used only to label the feed entries.
        user_id: numeric Instagram user id to fetch.
            TODO: generate user_id given a user_handle.

    Returns:
        A Flask-style response whose body is a JSON array of feed dicts.

    Raises:
        RuntimeError: if a feed item cannot be parsed.
    """
    feed_list = []
    web_api = Client(auto_patch=True, drop_incompat_keys=False)
    user_feed_info = web_api.user_feed(user_id, count=50)  # up to fifty posts
    for feeds in user_feed_info:
        try:
            raw_item = feeds["node"]
            date = datetime.fromtimestamp(
                int(raw_item.get('taken_at_timestamp')))
            # BUG FIX: posts without a caption have an empty "edges" list,
            # which made edges[0] raise IndexError.  Guard it.
            caption_edges = raw_item["edge_media_to_caption"]["edges"]
            caption = caption_edges[0]["node"]["text"] if caption_edges else ''
            feed_info = {
                "provider": "instagram",
                "provider_handle": user_handle or '',
                "link": raw_item["link"] or '',
                "likes": raw_item["likes"]["count"] or 0,
                "media": [],
                "video_views": raw_item.get('video_view_count') or 0,
                "caption": caption or '',
            }
            feed_info['pubDate'] = date.strftime(
                '%a, %d %b %Y %H:%M:%S') + ' GMT'
            img_link = raw_item.get('display_src') or raw_item.get(
                'thumbnail_src')
            if img_link:
                feed_info['media'].append(img_link)
            if raw_item["is_video"]:
                feed_info["videos"] = raw_item["display_url"]
                vid_link = feed_info["videos"]
                if vid_link:
                    feed_info['media'].append(vid_link)
            feed_list.append(feed_info)
        except Exception as e:
            # BUG FIX: `raise ("...")` raised a *string*, which is itself a
            # TypeError in Python 3.  Raise a real exception, chaining the
            # original cause.
            raise RuntimeError(
                "Could not get instagram feed or Feed does not exist") from e
    # BUG FIX: BytesIO requires bytes; json.dumps returns str — encode it.
    return app.response_class(BytesIO(json.dumps(feed_list).encode('utf-8')),
                              content_type='application/json')
def get_feed(twitter_api):
    """Fetch new Instagram posts and tweet them via `twitter_api`.

    Posts newer than the last-processed ID are turned into one tweet (or a
    reply chain when a gallery holds more than 4 images, Twitter's per-tweet
    media limit).  Afterwards the highest processed ID is persisted.

    NOTE(review): indentation reconstructed from a whitespace-collapsed
    source; nesting of the gallery batching below was inferred — verify
    against the original file.
    """
    last = src.last.get_last(src.last.PostType.MEDIA)
    highest = last
    web_api = Client(auto_patch=True, drop_incompat_keys=False)
    # presumably INSTAGRAM_USERID holds the numeric user id — confirm env setup
    user_feed = web_api.user_feed(os.getenv('INSTAGRAM_USERID'), count=23)
    # reversed(): process oldest first so the reply chain reads in order
    for post in reversed(user_feed):
        # ID comes in the format 'POSTID_USERID'
        post_id = int(post['node']['id'].split('_')[0])
        # If has not been processed already
        if post_id > last:
            # Hashtag
            tweet_metadata = ['#鈴木このみ', ' ']
            # Format timestamp
            timestamp = datetime.datetime.fromtimestamp(
                post['node']['taken_at_timestamp'],
                pytz.timezone('Asia/Tokyo'))
            tweet_metadata += [timestamp.strftime('%Y-%m-%d %H:%M'), '\n']
            # Post URL
            tweet_metadata.append(post['node']['link'])
            # Caption
            caption = post['node']['caption']['text']
            tweet_content = ['\n\n', caption]
            # List of media-URL lists; each inner list becomes one tweet
            # (original comment said "tuples of (type, url)" — not accurate)
            media = []
            if post['node']['__typename'] == MediaType.GALLERY.value:
                list_idx = 0  # NOTE(review): assigned but never used
                list_type = None
                media_list = []
                for gallery_item in post['node']['edge_sidecar_to_children'][
                        'edges']:
                    if gallery_item['node'][
                            '__typename'] == MediaType.VIDEO.value:
                        if list_type is None:
                            # No image list open: video gets its own tweet
                            media.append([gallery_item['node']['video_url']])
                        elif list_type is MediaType.IMAGE:
                            # Image list in progress
                            # Commit current list and create new list with video
                            media.append(media_list)
                            media.append([gallery_item['node']['video_url']])
                            list_type = None
                            media_list = []
                    else:
                        if list_type is None:
                            # No list in progress
                            list_type = MediaType.IMAGE
                            media_list.append(
                                gallery_item['node']['display_url'])
                        elif list_type is MediaType.IMAGE:
                            # Image list in progress
                            if len(media_list) > 4:
                                # List is somehow overfull
                                # Tweets only allow 4 images, so extra ones need to be split
                                while len(media_list) >= 4:
                                    media.append(media_list[:4])
                                    media_list = media_list[4:]
                                media_list.append(
                                    gallery_item['node']['display_url'])
                            elif len(media_list) == 4:
                                # List full
                                # Commit current list and create new list
                                media.append(media_list)
                                media_list = [
                                    gallery_item['node']['display_url']
                                ]
                            else:
                                # List not full yet
                                media_list.append(
                                    gallery_item['node']['display_url'])
                # Commit unfinished list if exists
                if list_type is MediaType.IMAGE and len(media_list) > 0:
                    media.append(media_list)
            elif post['node']['__typename'] == MediaType.VIDEO.value:
                media.append([post['node']['video_url']])
            else:
                # Single image post
                media.append([post['node']['display_url']])
            tweet_str = twutils.truncate_status(''.join(tweet_metadata +
                                                        tweet_content))
            prev_status = 0
            for tweet_media in media:
                # NOTE(review): `replyto` is computed but never passed on;
                # PostUpdate uses `prev_status` directly instead
                replyto = None
                if (prev_status > 0):
                    # Follow-up tweets in the chain repeat only the metadata
                    tweet_str = twutils.truncate_status(
                        ''.join(tweet_metadata))
                    replyto = prev_status
                if os.getenv('ENV', 'dev') == 'production':
                    prev_status = twitter_api.PostUpdate(
                        tweet_str, tweet_media,
                        in_reply_to_status_id=prev_status).id
                else:
                    # Dev mode: write what would have been tweeted
                    prev_status += 1
                    twitter_api.write(tweet_str + '\n\n')
                    twitter_api.write('\n'.join(tweet_media) + '\n\n')
            # Update highest ID if higher
            if post_id > highest:
                highest = post_id
    if (highest > last):
        src.last.set_last(str(highest), src.last.PostType.MEDIA)
# NOTE(review): fragment — the first statements are the tail of a function
# whose `def` is outside this view, and the `__main__` block is truncated
# mid-dict-literal; left byte-identical because completing it would be a guess.
new_str = new_str + doc ret = new_str.strip() ret = re.sub("[\n,/,\\\]", "", ret) # ret = ret.replace(" ", " ") return ret if __name__ == '__main__': if len(sys.argv) < 2: print('Usage: python3 get_feed.py [query] ') sys.exit(0) query = sys.argv[1] api = Client(auto_patch=True, drop_incompat_keys=False) user_feed = api.user_feed(query, count=10) feed_data = { "username": user_feed[0]["node"]["owner"]["username"], "user_id": user_feed[0]["node"]["owner"]["id"], "media": [] } for photo in user_feed: media_shortcode = photo["node"]["shortcode"] data = photo["node"] media_data = { "media_shortcode": media_shortcode, "photo_url": data["display_url"],
class InstagramSession(session.StreamSession):
    """Stream session that pages through a user's Instagram web feed."""

    # Number of posts requested per feed fetch.
    BATCH_COUNT = 25

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The web client emits noisy warnings on construction; suppress them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.web_api = Client(
                proxy=self.proxies.get("https") if self.proxies else None,
                auto_patch=True, drop_incompat_keys=False)
        # Per-user pagination cursor; defaults to None (start of feed).
        self.end_cursors = DefaultAttrDict(lambda: None)

    @memo(region="long")
    def user_name_to_id(self, user_name):
        """Resolve a screen name to a numeric Instagram user id (memoized)."""
        try:
            user_id = self.web_api.user_info2(user_name)["id"]
        except Exception:
            # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            raise SGException(f"user id for {user_name} not found")
        return user_id

    def get_feed_items(self, user_name, count=BATCH_COUNT):
        """Yield an AttrDict per post in the next batch for `user_name`.

        Advances the stored pagination cursor as posts are consumed.
        NOTE(review): `count` is accepted but `self.BATCH_COUNT` is used —
        preserved as-is to avoid changing caller-visible behavior.
        """
        try:
            feed = self.web_api.user_feed(
                self.user_name_to_id(user_name), count=self.BATCH_COUNT,
                end_cursor=self.end_cursors[user_name])
        except ClientConnectionError as e:
            logger.warn(f"connection error: {e}")
            # BUG FIX: `feed` was unbound here, so execution fell through to
            # `for post in feed` and raised NameError.  Yield nothing instead.
            return

        for post in feed:
            # Remember the pagination cursor, if present, for the next call.
            try:
                cursor = (post["node"]["edge_media_to_comment"]["page_info"]
                          ["end_cursor"])
                if cursor:
                    self.end_cursors[user_name] = cursor
            except KeyError:
                pass

            post_type = None
            post_id = post["node"]["id"]
            # A missing caption is None -> TypeError on subscript; fall back.
            try:
                title = post["node"]["caption"]["text"].replace("\n", "")
            except TypeError:
                title = "(no caption)"

            media_type = post["node"]["type"]
            if media_type == "video":
                post_type = "video"
                content = self.provider.new_media_source(
                    post["node"]["videos"]["standard_resolution"]["url"],
                    media_type="video")
            elif media_type == "image":
                if "carousel_media" in post["node"]:
                    # Multi-item post ("story"): one media source per entry.
                    post_type = "story"
                    content = [
                        self.provider.new_media_source(
                            m["images"]["standard_resolution"]["url"],
                            media_type="image")
                        if m["type"] == "image"
                        else self.provider.new_media_source(
                            m["video_url"], media_type="video")
                        if m["type"] == "video"
                        else None
                        for m in post["node"]["carousel_media"]
                    ]
                else:
                    post_type = "image"
                    content = self.provider.new_media_source(
                        post["node"]["images"]["standard_resolution"]["url"],
                        media_type="image")
            else:
                logger.warn(f"no content for post {post_id}")
                continue

            yield AttrDict(
                guid=post_id,
                title=title.strip(),
                post_type=post_type,
                created=datetime.fromtimestamp(
                    int(post["node"]["created_time"])),
                content=content)
from instagram_web_api import Client, ClientCompatPatch, ClientError, ClientLoginError
from sightengine.client import SightengineClient

# SECURITY: API credentials are hard-coded in source.  Move them to
# environment variables or a secrets store, and rotate the exposed key.
client = SightengineClient('630881392', 'St5TPUomwvLYq7eiXd4G')

web_api = Client(auto_patch=True, drop_incompat_keys=False)
user_feed_info = web_api.user_feed('232192182')


def _check_wad(my_url):
    """Run Sightengine's weapons/alcohol/drugs ('wad') model on an image URL."""
    return client.check('wad').set_url(my_url)


def checkDrugs(my_url):
    """Return the drugs score for the image at `my_url`."""
    return _check_wad(my_url)['drugs']


def checkWeapons(my_url):
    """Return the weapon score for the image at `my_url`."""
    return _check_wad(my_url)['weapon']


def checkAlcohol(my_url):
    """Return the alcohol score for the image at `my_url`."""
    return _check_wad(my_url)['alcohol']


def checkCaption(my_url):
    """Print the full 'wad' result for `my_url` (returns None, as before)."""
    print(_check_wad(my_url))
class InstagramCrawler:
    """Crawl one user's Instagram timeline and PUT each post to a social endpoint."""

    api = None
    user_id = '327416611'
    social_endpoint = None

    def __init__(self):
        self.api = Client(auto_patch=True, drop_incompat_keys=False)
        self.social_endpoint = os.getenv('SOCIAL_ENDPOINT',
                                         'http://localhost:8080')

    def fetch(self, end_cursor=None):
        """Fetch and process every timeline page starting at `end_cursor`.

        BUG FIX: pagination previously recursed once per page, so a long
        timeline could exhaust the recursion limit.  Rewritten as a loop
        with identical behavior (same requests, same 2s politeness delay).
        """
        while True:
            result = self.api.user_feed(self.user_id, count=50, extract=False,
                                        end_cursor=end_cursor)
            info = self.parse_http_result(result)
            for post in info['posts']:
                self.process_post(post)
            page_info = info['page_info']
            if not page_info.get('has_next_page', False):
                break
            time.sleep(2)  # be polite between page requests
            end_cursor = page_info['end_cursor']

    def parse_http_result(self, result):
        """Extract count/posts/page_info from a raw API response; exit on error."""
        if result.get('status', 'error') != 'ok':
            sys.exit('api response not ok')
        media = result['data']['user']['edge_owner_to_timeline_media']
        return {
            'count': media['count'],
            'posts': [edge['node'] for edge in media['edges']],
            'page_info': media['page_info'],
        }

    def process_post(self, post):
        """PUT one parsed post to the social endpoint; exit on non-200."""
        payload = self.parse_post(post)
        r = requests.put('%s/instagram' % self.social_endpoint, json=payload)
        if r.status_code != 200:
            sys.exit(r.text)

    def parse_post(self, post):
        """Normalize a raw post node into the payload sent to the endpoint.

        Splits trailing hashtags out of the caption into a `tags` list and
        emits an ISO-8601 UTC timestamp.
        """
        text_edges = post['edge_media_to_caption']['edges']
        if not text_edges:
            caption = ''
            tags = []
        else:
            text = text_edges[0]['node']['text']
            # Raw strings: '\s' / '\.' are invalid escapes in plain strings
            # (SyntaxWarning on modern Python); patterns are unchanged.
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\.\s+', '', text)
            tags = list({
                tag.strip().lower()
                for tag in re.findall(r'(?<=#)[^# ]+(?=#|$| )', text)
            })
            caption = re.sub(r'(#[^# ]+ )*(#[^# ]+$)', '', text)
        return {
            'shortcode': post['shortcode'],
            'caption': caption,
            'tags': tags,
            'likes': post['likes']['count'],
            'comments': post['comments']['count'],
            'type': post['type'],
            'thumbnail': post['images']['thumbnail']['url'],
            'image': post['images']['standard_resolution']['url'],
            # Same output as the deprecated utcfromtimestamp().replace(utc).
            'timestamp':
                datetime.fromtimestamp(int(post['created_time']),
                                       tz=timezone.utc).isoformat()
        }