def ig_login(currentUser):
    """Log in to Instagram for *currentUser* and return their timeline posts.

    Reuses cached auth settings from the per-user cookie file when present;
    otherwise performs a fresh login and persists the settings via
    ``onlogin_callback``. On expired sessions it re-logs-in with the cached
    device_id. Exits the process (codes 9/99) on unrecoverable client errors,
    mirroring the original script-style handling.

    Returns:
        list[dict]: one dict per timeline post with keys
        'Platform', 'Date' and 'Link'.
    """
    device_id = None
    try:
        settings_file = os.path.join(
            current_app.root_path, 'cookies', currentUser['username'])
        if not os.path.isfile(settings_file):
            # No cached settings: fresh login, persisting auth state on success.
            api = Client(
                currentUser['ig-username'], currentUser['ig-password'],
                on_login=lambda x: onlogin_callback(x, settings_file))
        else:
            with open(settings_file) as file_data:
                cached_settings = json.load(file_data, object_hook=from_json)
            device_id = cached_settings.get('device_id')
            # Reuse cached auth settings to avoid a full re-login.
            api = Client(
                currentUser['ig-username'], currentUser['ig-password'],
                settings=cached_settings)
    except (ClientCookieExpiredError, ClientLoginRequiredError) as e:
        print(
            'ClientCookieExpiredError/ClientLoginRequiredError: {0!s}'.format(e))
        # Session expired: re-login with default ua/keys but keep device_id.
        api = Client(
            currentUser['ig-username'], currentUser['ig-password'],
            device_id=device_id,
            on_login=lambda x: onlogin_callback(x, settings_file))
    except ClientLoginError as e:
        print('ClientLoginError {0!s}'.format(e))
        exit(9)
    except ClientError as e:
        print('ClientError {0!s} (Code: {1:d}, Response: {2!s})'.format(
            e.msg, e.code, e.error_response))
        exit(9)
    except Exception as e:
        print('Unexpected Exception: {0!s}'.format(e))
        exit(99)

    # Fetch the timeline; entries without 'media_or_ad' are suggestions or
    # placeholders and are skipped.
    posts = api.feed_timeline()
    items = [item for item in posts.get('feed_items', [])
             if item.get('media_or_ad')]
    tz = pytz.timezone('America/New_York')
    lst = []
    for item in items:
        media = item['media_or_ad']
        ClientCompatPatch.media(media)
        # Build a fresh dict per post; the original built a template dict and
        # appended a copy of it, which the inline literal replaces directly.
        lst.append({
            'Platform': 'Instagram',
            'Date': convert_time(tz, media['taken_at']),
            'Link': 'https://www.instagram.com/p/' + str(media['code']),
        })
    return lst
def login(username, password):
    """Log in to Instagram and print each timeline item plus its media code.

    On any failure shows a (Turkish) error dialog telling the user to check
    their username/password.
    """
    try:
        api = Client(username, password)
        results = api.feed_timeline()
        for item in results.get('items', []):
            print(item)
            media = ClientCompatPatch.media(item)
            print(media['code'])
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of triggering the error dialog.
        messagebox.showinfo(
            "Hata",
            "Instagram girişi hatalı lütfen kullanıcı adı şifrenizi kontrol ediniz"
        )
def instagram_login():
    """Attempt an Instagram login as the module-level `username` using the
    password stored in the local file ``data``.

    Returns:
        IOError: if the password file cannot be read; otherwise ``None``,
        after printing whether the login succeeded.
    """
    try:
        # Context manager ensures the password file is closed promptly
        # (the original leaked the handle via open(...).read()).
        with open('data', 'r') as f:
            passwd = f.read()
    except IOError as err:
        return err
    delay = 2
    try:
        a = Client(username, passwd)
        a.feed_timeline()
        loads('Finding Password ...')
        sleep(delay * 7)
    except Exception:
        # Narrowed from a bare `except:`; any client failure is treated as
        # a wrong password, matching the original best-effort behavior.
        loads('Finding Password ...')
        sleep(delay * 7)
        # NOTE(review): original used Python-2 print statements; converted to
        # py3 calls consistent with the rest of the file.
        print('Password not found! Try again')
    else:
        print('\nlogin as', username, 'successfully')
        print('Password =>', passwd)
def scrape(self):
    """Collect (post caption, comment text) pairs from the timeline.

    Skips sponsored items (those carrying 'ad_metadata') and posts without a
    caption, and returns a list of ``[caption_text, comment_text]`` pairs.
    """
    api = Client(self.username, self.password)
    results = api.feed_timeline()
    comments_vector = []
    items = [
        item for item in results.get('feed_items', [])
        if item.get('media_or_ad')
    ]
    for item in items:
        media = item['media_or_ad']
        # Manually patch the entity to match the public api as closely as
        # possible; Client(auto_patch=True) would do this automatically.
        ClientCompatPatch.media(media)
        if 'ad_metadata' in media:
            continue  # sponsored post — skip
        caption = media.get('caption')
        if not caption:
            # BUG FIX: the original read caption['media_id'] unconditionally
            # and raised TypeError on caption-less posts.
            continue
        comments = api.media_comments(caption['media_id'])
        for comment in comments['comments']:
            comments_vector.append([caption['text'], comment['text']])
    return comments_vector
# Log in with the script-level credentials.
api = Client(user_name, password)
# user_feed_info = api.user_feed('329452045', count=1)
# print(user_feed_info)
# for post in user_feed_info:
#     print("post", post)
#     print(f"{post['link']}, {post['user']['username']}")

# Dump the autocomplete user list for inspection.
see = api.autocomplete_user_list()
print(see)

# following = api.user_following('123456', '1')
# for user in following:
#     print(user['username'])

# Walk the timeline and print the shortcode of every real media item;
# entries without 'media_or_ad' are suggestions/placeholders.
results = api.feed_timeline()
for entry in results.get('feed_items', []):
    media = entry.get('media_or_ad')
    if not media:
        continue
    # Patch the entity toward the public-API shape; optional
    # (Client(auto_patch=True) does this automatically).
    ClientCompatPatch.media(media)
    print(media['code'])

# from instagram_web_api import Client, ClientCompatPatch, ClientError, ClientLoginError
#
# # Without any authentication
# web_api = Client(auto_patch=True, drop_incompat_keys=False)
# user_feed_info = web_api.user_feed('329452045', count=10)
# for post in user_feed_info:
# NOTE(review): these except clauses continue a try: block that begins above
# this chunk (presumably wrapping the Client login) — confirm against the
# full file.
except ClientError as e:
    print('ClientError {0!s} (Code: {1:d}, Response: {2!s})'.format(
        e.msg, e.code, e.error_response))
    exit(9)
except Exception as e:
    print('Unexpected Exception: {0!s}'.format(e))
    exit(99)

# Report when the authenticated session's cookies expire.
cookie_expiry = api.cookie_jar.auth_expires
print('Cookie Expiry: {0!s}'.format(
    datetime.datetime.fromtimestamp(cookie_expiry).strftime(
        '%Y-%m-%dT%H:%M:%SZ')))

rank_token = Client.generate_uuid()
tag_results = []

# Pull both the timeline feed and the liked-media feed.
resultsForTimeline = api.feed_timeline()
resultsForLikedFeeds = api.feed_liked()

# Round-trip through JSON (dumps then loads) to obtain plain dict/list data.
apple = json.dumps([resultsForTimeline], indent=100)
mango = json.dumps([resultsForLikedFeeds], indent=100)
y = json.loads(apple)
z = json.loads(mango)

# Collect every caption text from the timeline feed items.
stringList = []
for s in y:
    for e in s['feed_items']:
        try:
            stringList.append(e['media_or_ad']['caption']['text'])
        # NOTE(review): bare except silently skips items without a
        # media/caption but also hides unrelated errors — consider
        # narrowing to (KeyError, TypeError).
        except:
            print('')
# Walk up to 10 pages of the timeline and count how many posts are ads
# (posts from accounts the user does not follow) or come from verified
# accounts, then print a summary line.
next_max_id = ''
for _page in range(1, 10):
    # results = api.feed_timeline(seen_posts=watched[:-1])
    results = api.feed_timeline(max_id=next_max_id)
    # BUG FIX: the last page carries no 'next_max_id'; the original
    # results['next_max_id'] raised KeyError there. Use .get() and stop
    # paging once the feed is exhausted.
    next_max_id = results.get('next_max_id', '')
    items = [item for item in results.get('feed_items', [])
             if item.get('media_or_ad')]
    for item in items:
        # Manually patch the entity to match the public api as closely as
        # possible; Client(auto_patch=True) would do this automatically.
        posts_counter += 1
        ClientCompatPatch.media(item['media_or_ad'])
        if not item['media_or_ad']['user']['friendship_status']['following']:
            ads_counter += 1
        elif item['media_or_ad']['user']['is_verified']:
            verified_counter += 1
        # print(item['media_or_ad']['user']['username'])
    if not next_max_id:
        break  # no more pages to fetch

# Guard against ZeroDivisionError when the feed returned no posts at all.
if posts_counter:
    print('stats : ' + str(posts_counter) + ' posts total, from all of them : '
          + str(verified_counter + ads_counter) + " are ads or verified "
          + str(verified_counter) + '(verified) ' + str(ads_counter)
          + '(ads) ~ '
          + str(100 * (verified_counter + ads_counter) / posts_counter) + '%')
else:
    print('stats : no posts fetched')
class InstagramScrape(object):
    """Scraping wrapper around the private Instagram API client.

    Flattens feed entries (single posts and carousels) into simple image
    dicts and exposes helpers for searches, comments, replies and stories.
    """

    def __init__(self, username, password):
        print("setting up instagram scraper...")
        self.username = username
        self.password = password
        self.api = Client(username=username, password=password)
        # Numeric account id ('pk') of the authenticated user.
        self.user_id = self.api.username_info(self.username)['user']['pk']
        print("instagram scraper successfully initialized!")

    def get_timeline(self):
        """Return the raw timeline feed response."""
        return self.api.feed_timeline()

    @staticmethod
    def _flatten_entry(entry, user_id, user_id_key):
        """Flatten one feed entry (post or carousel) into image dicts.

        Keeps every other image candidate (the API lists each resolution
        in adjacent pairs — presumably; confirm against live responses).
        `user_id_key` preserves the historical key-name difference between
        self-feed results ('id') and user-feed results ('user_id').
        """
        if 'carousel_media_count' in entry:
            media = entry['carousel_media']
        else:
            media = [entry]
        carousel_parent_id = entry['id']
        caption = entry.get('caption')
        text = caption['text'] if caption is not None else ''
        username = entry['user']['username']
        full_name = entry['user']['full_name']
        images = []
        for medium in media:
            media_id = medium['id']
            candidates = medium['image_versions2']['candidates']
            for i, candidate in enumerate(candidates):
                if i % 2:
                    continue
                images.append({
                    "carousel_parent_id": carousel_parent_id,
                    user_id_key: user_id,
                    "username": username,
                    "full_name": full_name,
                    "media_id": media_id,
                    "url": candidate["url"],
                    "text": text,
                })
        return images

    def get_self_feed(self):
        """Return flattened image dicts for the authenticated user's feed."""
        result = []
        for entry in self.api.self_feed()['items']:
            caption = entry.get('caption')
            # BUG FIX: the original read entry['caption']["user_id"]
            # unconditionally and raised TypeError on caption-less posts;
            # fall back to the poster's pk in that case.
            user_id = caption["user_id"] if caption is not None else entry['user']['pk']
            result.extend(self._flatten_entry(entry, user_id, "id"))
        return result

    def get_own_stories(self):
        """Return the authenticated user's current stories as simple dicts."""
        result = []
        tray = self.api.reels_tray()['tray'][0]
        user_id = tray['id']
        username = tray['user']['username']
        full_name = tray['user']['full_name']
        for story in tray['items']:
            result.append({
                "user_id": user_id,
                "username": username,
                "full_name": full_name,
                "url": story['image_versions2']['candidates'][0]['url'],
            })
        return result

    def search_results(self, query):
        """Search users by *query*; return dicts with id/username/full_name."""
        result = []
        for user in self.api.search_users(query)['users']:
            result.append({
                "id": user['pk'],
                "username": user['username'],
                "full_name": user['full_name'],
            })
        return result

    def get_user_feed(self, username, id):
        """Return flattened image dicts for *username*'s feed.

        Returns the ClientError instance instead of raising on failure —
        callers (see scrape) rely on this error-as-value convention.
        """
        result = []
        try:
            feed = self.api.username_feed(username)['items']
        except ClientError as err:
            return err
        for entry in feed:
            result.extend(self._flatten_entry(entry, id, "user_id"))
        return result

    def gather_media_comments(self, media_id):
        """Return the comments on *media_id* as simple dicts."""
        results = []
        for comment in self.api.media_comments(media_id)["comments"]:
            results.append({
                'pk': comment['user']['pk'],
                'username': comment['user']['username'],
                'media_id': media_id,
                'comment_id': comment['pk'],
                'text': comment['text'],
            })
        return results

    def get_comment_replies(self, media_id, comment_id):
        """Return the replies to *comment_id* on *media_id* as simple dicts."""
        results = []
        replies = self.api.comment_replies(media_id, comment_id)['child_comments']
        for reply in replies:
            results.append({
                'id': reply['user_id'],
                'comment': reply['text'],
                'from': reply['user']['username'],
                'from_id': reply['user']['pk'],
                'media_id': media_id,
                'comment_id': comment_id,
            })
        return results

    def get_explore_results(self):
        """Return the raw explore feed."""
        return self.api.explore()

    def get_story_archive(self):
        """Return the authenticated user's story highlights feed."""
        return self.api.highlight_user_feed(self.user_id)

    def get_top_search(self):
        """Return the raw top-search results."""
        return self.api.top_search()

    def scrape(self, mental_illnesses):
        """For each search term, collect de-duplicated posts from the feeds
        of every matching account and return them keyed by term."""
        instagram_results = {}
        for mental_illness in mental_illnesses:
            instagram_results[mental_illness] = []
            print("currently handling Instagram information:")
            ig_result = self.search_results(mental_illness)
            handle_feed_lst = {}
            for handle in ig_result:
                handle_feed_lst[handle['id']] = []
                handle_feed = self.get_user_feed(handle['username'],
                                                 handle['id'])
                # isinstance (not `type(...) is`) so ClientError subclasses
                # are also treated as failures.
                if not isinstance(handle_feed, ClientError):
                    handle_feed_lst[handle['id']].extend(handle_feed)
            carousel_ids = []
            for user, posts in handle_feed_lst.items():
                for post in posts:
                    print(post)
                    # One result per carousel: skip duplicates.
                    if post['carousel_parent_id'] in carousel_ids:
                        continue
                    print(post['carousel_parent_id'])
                    carousel_ids.append(post['carousel_parent_id'])
                    instagram_results[mental_illness].append({
                        'id': user,
                        'text': post['text'],
                        'media': post['url'],
                    })
            print(instagram_results[mental_illness])
        # Return after processing every term (the collapsed original's
        # return placement was ambiguous; returning early would make the
        # loop over terms pointless).
        return instagram_results
        # filename = 'instagram_' + mental_illness + '.json'
        # with open(filename, 'w', encoding='utf-8') as f:
        #     json.dump(instagram_results, f)
        # print("complete handling Instagram information.")

    def scrape_self(self):
        """Return the authenticated user's flattened feed."""
        return self.get_self_feed()

    def save_self_scraped_information(self):
        """Scrape own feed and save it to instagram_<username>.json."""
        results = self.scrape_self()
        filename = 'instagram_' + self.username + '.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f)


# Do this:
# 1) Detect post.
# 2) Caption and picture.
# 3) Get the image out.
# 4) Process the image
# For user posts
# 1st run ==> Get last 50 posts if possible.
# 2nd run onwards ==> Get last 3 posts.
# Run weekly
# For user likes
# Get liked posts once every 4 hours.
class Instagram:
    """Collect photos/videos from Instagram into the local database."""

    def __init__(self, login, password):
        self.Source = "Instagram"
        self.API = Client(login, password)

    def get_user_video(self, user_id):
        """Download every video from *user_id*'s feed.

        :param user_id: Account id in instagram
        :return: number of videos saved
        """
        first_page = True
        count_of_loaded_photo = 0
        next_max_id = None
        while next_max_id or first_page:
            try:
                first_page = False
                results = self.API.user_feed(user_id=user_id,
                                             max_id=next_max_id)
                for item in results.get('items', []):
                    try:
                        source_id = item["id"]
                        date = item["caption"]["created_at"]
                        photo_url = item['video_versions'][0]['url']
                        print('I find video))')
                    except Exception:
                        # Not a video post (or captionless): skip it.
                        continue
                    if self._save_photo(photo_url, source_id, date,
                                        extension='mp4'):
                        print('I load video))')
                        db_utils.insert_photo({
                            "source_id": source_id,
                            "source": self.Source,
                            "date": date
                        })
                        count_of_loaded_photo += 1
                        if count_of_loaded_photo % db_utils.COMMIT_COUNT == 0:
                            db_utils.commit()
                next_max_id = results.get('next_max_id')
            except Exception:
                # Narrowed from a bare except; log and keep the original
                # best-effort retry behavior.
                utils.print_message(traceback.format_exc())
        db_utils.commit()
        return count_of_loaded_photo

    def get_user_photo(self, user_id):
        """Download every photo from *user_id*'s feed.

        :param user_id: Account id in instagram
        :return: number of photos saved
        """
        first_page = True
        count_of_loaded_photo = 0
        next_max_id = None
        while next_max_id or first_page:
            try:
                first_page = False
                results = self.API.user_feed(user_id=user_id,
                                             max_id=next_max_id)
                for item in results.get('items', []):
                    try:
                        source_id = item["id"]
                        date = item["caption"]["created_at"]
                        photo_url = item["image_versions2"]["candidates"][0][
                            "url"]
                    except Exception:
                        # Item lacks a caption or image candidates: skip it.
                        continue
                    if self._save_photo(photo_url, source_id, date):
                        db_utils.insert_photo({
                            "source_id": source_id,
                            "source": self.Source,
                            "date": date
                        })
                        count_of_loaded_photo += 1
                        if count_of_loaded_photo % db_utils.COMMIT_COUNT == 0:
                            db_utils.commit()
                next_max_id = results.get('next_max_id')
            except Exception:
                # Narrowed from a bare except; log and continue best-effort.
                utils.print_message(traceback.format_exc())
        db_utils.commit()
        return count_of_loaded_photo

    def get_timeline(self, K):
        """Download up to *K* photos from the feed timeline.

        :param K: maximum number of posts to take from the timeline
        :return: number of photos saved
        """
        first_page = True
        count_of_loaded_photo = 0
        next_max_id = None
        counter = K
        while (next_max_id or first_page) and counter > 0:
            try:
                first_page = False
                results = self.API.feed_timeline(max_id=next_max_id)
                for item in results.get('feed_items', []):
                    try:
                        source_id = item["media_or_ad"]["id"]
                        date = item["media_or_ad"]["caption"]["created_at"]
                        photo_url = item["media_or_ad"]["image_versions2"][
                            "candidates"][0]["url"]
                    except Exception:
                        # Suggestion/ad entry or captionless post: skip it.
                        continue
                    if counter <= 0:
                        # BUG FIX: the original bare `return` yielded None
                        # (callers expect a count) and skipped the final
                        # commit, losing pending inserts.
                        db_utils.commit()
                        return count_of_loaded_photo
                    counter -= 1
                    if self._save_photo(photo_url, source_id, date):
                        db_utils.insert_photo({
                            "source_id": source_id,
                            "source": self.Source,
                            "date": date
                        })
                        count_of_loaded_photo += 1
                        if count_of_loaded_photo % db_utils.COMMIT_COUNT == 0:
                            db_utils.commit()
                next_max_id = results.get('next_max_id')
            except Exception:
                # Narrowed from a bare except; log and continue best-effort.
                utils.print_message(traceback.format_exc())
        db_utils.commit()
        return count_of_loaded_photo

    def get_followings_accounts(self):
        """Return the accounts the authenticated user follows."""
        return self.API.user_following(
            self.API.authenticated_user_id).get('users')

    def load_all_following_photo(self):
        """Download every photo from each followed account; return the count."""
        count_of_loaded_photo = 0
        for account in self.get_followings_accounts():
            count_of_loaded_photo += self.get_user_photo(account['pk'])
        return count_of_loaded_photo

    def _save_photo(self, url, source_id, source_time, extension='jpg'):
        """Download *url* to .\\<Source>\\<source_id>.<extension>.

        Skips IDs already recorded in the database; retries up to 3 times.
        :return: True if the file was written, False otherwise.
        """
        TRY_COUNTS = 3
        try_counter = TRY_COUNTS
        result = False
        while try_counter > 0:
            try:
                if db_utils.check_exists(source_id, self.Source):
                    break
                filename = r".\{}\{}.{}".format(self.Source, source_id,
                                                extension)
                p = requests.get(url)
                if p.status_code == 200:
                    with open(filename, "wb") as f:
                        f.write(p.content)
                    result = True
                    break
            except Exception:
                # Narrowed from a bare except; log and retry.
                utils.print_message(traceback.format_exc())
            try_counter -= 1
        return result