def get_tags(browser, url):
    """Gets all the tags of the given description in the url"""
    # Navigate only if the browser is not already on the target URL.
    web_address_navigator(browser, url)

    # Expose the post data as window.insta_data, falling back to the legacy
    # _sharedData layout when __additionalData is not present.
    try:
        browser.execute_script(
            "window.insta_data = window.__additionalData[Object.keys(window.__additionalData)[0]].data"
        )
    except WebDriverException:
        browser.execute_script(
            "window.insta_data = window._sharedData.entry_data.PostPage[0]"
        )

    # The caption lives at a different JS path depending on the data format.
    uses_graphql = browser.execute_script("return ('graphql' in window.insta_data)")
    if uses_graphql:
        caption = browser.execute_script(
            "return window.insta_data.graphql."
            "shortcode_media.edge_media_to_caption.edges[0].node.text"
        )
    else:
        caption = browser.execute_script(
            "return window.insta_data.media.caption.text"
        )

    # Hashtags: '#' followed by any run of word characters.
    return findall(r"#\w*", caption)
def fetch_connection_ip(browser):
    """Return the current outgoing IP address as a string.

    If the session has a proxy configured, the IPv4 address is parsed
    straight out of the proxy string; otherwise a third-party
    "what is my IP" page is loaded and its response body returned.

    :param browser: The selenium webdriver instance
    :return: the IP address text
    """
    try:
        # trust proxy string if a proxy is provided
        proxy_str = _session.proxy_string
        return re.search(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", proxy_str).group(0)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. `Exception` still covers the expected failures:
        # no proxy configured (AttributeError) or no IPv4 match (AttributeError
        # on `.group` of None).
        # fetch current ip from third-party website
        web_address_navigator(browser, ip_address_check_url)
        return browser.find_element_by_tag_name("pre").text
def test_connection(browser):
    """Check connectivity by resolving the current IP and loading Instagram.

    :param browser: The selenium webdriver instance
    :return: dict with the detected ``ip`` and the raw ``instagramResponse``
        page source (``None`` when no IP could be determined)
    """
    # NOTE: the original wrapped this body in
    # `try: ... except Exception as e: raise e`, which merely re-raised the
    # same exception — a no-op wrapper, removed here.
    fetch_ip = fetch_connection_ip(browser)
    fetch_instagram_data = None
    if fetch_ip:
        web_address_navigator(browser, instagram_test_url)
        fetch_instagram_data = browser.page_source
    return {"ip": fetch_ip, "instagramResponse": fetch_instagram_data}
def nf_go_to_follow_page(self, which: str, username: str):
    """Open the followers/following page of `username` by clicking its link."""
    # TODO: do it naturally
    try:
        page_button = self.browser.find_element_by_xpath(
            '//a[@href="/{}/{}/"]'.format(username, which)
        )
        nf_scroll_into_view(self, page_button)
        nf_click_center_of_element(self, page_button)
    except NoSuchElementException:
        self.logger.warning("Failed to get {} page button".format(which))
    sleep(2)
    # Verify the click actually landed on the expected page; otherwise
    # fall back to navigating there directly.
    target_url = "https://www.instagram.com/{}/{}/".format(username, which)
    if not check_if_in_correct_page(self, target_url):
        self.logger.error(
            "Failed to go to {} page, navigating there".format(which))
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, target_url)
def nf_go_from_post_to_profile(self, username: str):
    """From an open post, click the poster's header element to reach their profile."""
    try:
        header_button = self.browser.find_element_by_xpath(
            '/html/body/div[1]/section/main/div/div/article/header//div[@class="e1e1d"]'
        )
        nf_scroll_into_view(self, header_button)
        nf_click_center_of_element(self, header_button)
    except NoSuchElementException:
        self.logger.warning("Failed to get user page button")
    sleep(1)
    # Confirm navigation worked; otherwise jump to the profile directly.
    profile_url = "https://www.instagram.com/{}/".format(username)
    if not check_if_in_correct_page(self, profile_url):
        self.logger.error("Failed to go to user page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, profile_url)
def nf_go_to_user_page(self, username: str):
    """Navigates to the provided user page by typing its name on explore"""
    try:
        nf_type_on_explore(self, username)
        sleep(2)
        # Pick the exact profile entry from the search results.
        result_entry = self.browser.find_element_by_xpath(
            '//a[@href="/{}/"]'.format(username)
        )
        nf_click_center_of_element(self, result_entry)
    except NoSuchElementException:
        self.logger.warning("Failed to go to get a page element")
    sleep(1)
    # Confirm navigation worked; otherwise jump to the profile directly.
    profile_url = "https://www.instagram.com/{}/".format(username)
    if not check_if_in_correct_page(self, profile_url):
        self.logger.error("Failed to go to user page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, profile_url)
def nf_go_to_tag_page(self, tag: str):
    """Navigates to the provided tag page by typing it on explore"""
    try:
        nf_type_on_explore(self, "#" + tag)
        sleep(2)
        # Pick the exact hashtag entry from the search results.
        result_entry = self.browser.find_element_by_xpath(
            '//a[@href="/explore/tags/{}/"]'.format(tag)
        )
        nf_click_center_of_element(self, result_entry)
    except NoSuchElementException:
        self.logger.warning("Failed to get a page element")
    sleep(1)
    # Confirm navigation worked; otherwise jump to the tag page directly.
    tag_url = "https://www.instagram.com/explore/tags/{}/".format(tag)
    if not check_if_in_correct_page(self, tag_url):
        self.logger.error("Failed to go to tag page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, tag_url)
def nf_find_and_press_back(self, link: str):
    """Finds and press back button"""
    # Known xpath variants for Instagram's back button (class names and
    # aria-labels have varied across frontend versions).
    possibles = [
        '/html/body/div[1]/section/nav[1]/div/header//a[@class=" Iazdo"]',
        '/html/body/div[1]/section/nav[1]/div/header//a[@class="Iazdo"]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@class="_8-yf5 "]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@class="_8-yf5"]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@aria-label="Back"]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@class="_8-yf5 "]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@class="_8-yf5"]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@aria-label="Back"]',
    ]
    pressed_path = None
    for xpath in possibles:
        try:
            back_button = self.browser.find_element_by_xpath(xpath)
            nf_scroll_into_view(self, back_button)
            nf_click_center_of_element(self, back_button)
            # Second, JS-driven click kept from the original behaviour.
            self.browser.execute_script("arguments[0].click();", back_button)
            pressed_path = xpath
            break
        except NoSuchElementException:
            continue
    if pressed_path is None:
        self.logger.warning("Failed to get back button with all xpaths")
    else:
        self.logger.info(
            "Pressed back button with xpath:\n {}".format(pressed_path))
    sleep(3)
    if not check_if_in_correct_page(self, link):
        self.logger.error("Failed to go back, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, link)
def db_store_comments(self, posts: List[Post], post_link: str):
    """Stores all comments of open post then goes back to post page.

    Clicks through to the post's comments page, loads up to 10 batches of
    additional comments, then persists each visible comment (and its author)
    to the database, linking the comment to every Post in `posts`.

    :param posts: DB Post rows the scraped comments are attached to
    :param post_link: URL of the post page to return to when done
    """
    try:
        # Button linking to the dedicated comments page of the open post.
        comments_button = self.browser.find_elements_by_xpath(
            '//article//div[2]/div[1]//a[contains(@href,"comments")]')
        if comments_button:
            nf_scroll_into_view(self, comments_button[0])
            nf_click_center_of_element(self, comments_button[0])
            sleep(2)
            comments_link = post_link + 'comments/'
            if not check_if_in_correct_page(self, comments_link):
                self.logger.error(
                    "Failed to go to comments page, navigating there")
                # TODO: retry to get there naturally
                web_address_navigator(self.browser, comments_link)
            # Click "Load more comments" up to 10 times to expand the list.
            more_comments = self.browser.find_elements_by_xpath(
                '//span[@aria-label="Load more comments"]')
            counter = 1
            while more_comments and counter <= 10:
                self.logger.info("Loading comments ({}/10)...".format(counter))
                nf_scroll_into_view(self, more_comments[0])
                self.browser.execute_script("arguments[0].click();",
                                            more_comments[0])
                more_comments = self.browser.find_elements_by_xpath(
                    '//span[@aria-label="Load more comments"]')
                counter += 1
            # Each <ul class="Mr508"> is one comment thread entry.
            comments = self.browser.find_elements_by_xpath(
                '/html/body/div[1]/section/main/div/ul/ul[@class="Mr508"]')
            for comment in comments:
                inner_container = comment.find_element_by_xpath(
                    './/div[@class="C4VMK"]')
                username = inner_container.find_element_by_xpath(
                    './/h3/div/a').text
                # Strip emojis before storing the comment text.
                text, _ = deform_emojis(
                    inner_container.find_element_by_xpath('.//span').text)
                post_date = inner_container.find_element_by_xpath(
                    './/time').get_attribute('datetime')
                # [:-1] drops the trailing 'Z' so fromisoformat can parse it
                # (produces a naive datetime).
                post_date = datetime.fromisoformat(post_date[:-1])
                user = db_get_or_create_user(self, username)
                self.db.session.add(user)
                self.db.session.commit()
                # Attach this comment to every post row we were given.
                for post in posts:
                    comment = Comment(
                        date_posted=post_date,
                        text=text,
                        user=user,
                        post=post,
                    )
                    self.db.session.add(comment)
                    self.db.session.commit()
        else:
            self.logger.error("No comments found")
    except SQLAlchemyError:
        # Roll back the failed transaction, then re-raise for the caller.
        self.db.session.rollback()
        raise
    finally:
        # NOTE(review): this commit also runs after a rollback+raise above —
        # presumably committing whatever state remains; confirm intent.
        self.db.session.commit()
        # Always return to the post page, even on failure.
        nf_find_and_press_back(self, post_link)
def check_link2(
    browser,
    post_link,
    dont_like,
    mandatory_words,
    mandatory_language,
    mandatory_character,
    is_mandatory_character,
    check_character_set,
    ignore_if_contains,
    logger,
):
    """
    Check the given link if it is appropriate

    :param browser: The selenium webdriver instance
    :param post_link: URL of the post being inspected
    :param dont_like: hashtags of inappropriate phrases
    :param mandatory_words: words of appropriate phrases
    :param mandatory_language: when truthy, the caption must satisfy
        `check_character_set`
    :param mandatory_character: kept for interface compatibility (unused here)
    :param is_mandatory_character: kept for interface compatibility (unused here)
    :param check_character_set: callable validating the caption's character set
    :param ignore_if_contains: words that short-circuit the check as a pass
    :param logger: the logger instance
    :return: tuple of boolean: True if inappropriate,
             string: the username,
             integer: number of likes,
             integer: number of comments,
             posting_date_str: string,
             location_name: string,
             image_text: string,
             boolean: True if it is video media,
             string: the message if inappropriate else 'None',
             string: set the scope of the return value
    """
    # Check URL of the webpage, if it already is post's page, then do not
    # navigate to it again
    web_address_navigator(browser, post_link)

    # Check if the Post is Valid/Exists
    try:
        post_page = browser.execute_script(
            "return window.__additionalData[Object.keys(window.__additionalData)[0]].data"
        )
    except WebDriverException:
        # handle the possible `entry_data` error
        try:
            browser.execute_script("location.reload()")
            update_activity(browser, state=None)
            post_page = browser.execute_script(
                "return window._sharedData.entry_data.PostPage[0]"
            )
        except WebDriverException:
            post_page = None

    if post_page is None:
        logger.warning("Unavailable Page: {}".format(post_link.encode("utf-8")))
        return (True, None, None, None, None, None, None, None,
                "Unavailable Page", "Failure")

    web_address_navigator(browser, post_link)

    likes_count = get_likes(browser, logger)
    try:
        comments_count, comments_status = get_comments_count(browser, logger)
    except Exception:
        # FIX: the original handler did `comments_status = comments_status`
        # (a NameError when get_comments_count failed before assigning) and
        # used a bare `except:`. Both values are simply unknown here.
        comments_count = None
        comments_status = None

    time_element = browser.find_element_by_xpath("//div/a/time")
    posting_datetime_str = time_element.get_attribute("datetime")

    # Gets the description of the post's link and checks for the dont_like tags
    graphql = "graphql" in post_page

    # FIX: `location_name` was only set in the graphql branch but is used in
    # every return below — default it so legacy posts cannot raise NameError.
    location_name = None

    if graphql:
        media = post_page["graphql"]["shortcode_media"]
        is_video = media["is_video"]
        user_name = media["owner"]["username"]
        image_text = media["edge_media_to_caption"]["edges"]
        image_text = image_text[0]["node"]["text"] if image_text else None
        location = media["location"]
        location_name = location["name"] if location else None
        media_edge_string = get_media_edge_comment_string(media)
        # double {{ allows us to call .format here:
        try:
            browser.execute_script(
                "window.insta_data = window.__additionalData[Object.keys(window.__additionalData)[0]].data"
            )
        except WebDriverException:
            browser.execute_script(
                "window.insta_data = window._sharedData.entry_data.PostPage[0]"
            )
        owner_comments = browser.execute_script(
            """
            latest_comments = window.insta_data.graphql.shortcode_media.{}.edges;
            if (latest_comments === undefined) {{
                latest_comments = Array();
                owner_comments = latest_comments
                    .filter(item => item.node.owner.username == arguments[0])
                    .map(item => item.node.text)
                    .reduce((item, total) => item + '\\n' + total, '');
                return owner_comments;}}
            else {{
                return null;}}
            """.format(
                media_edge_string
            ),
            user_name,
        )
    else:
        media = post_page[0]["shortcode_media"]
        is_video = media["is_video"]
        user_name = media["owner"]["username"]
        image_text = media["caption"]
        owner_comments = browser.execute_script(
            """
            latest_comments = window._sharedData.entry_data.PostPage[
                0].media.comments.nodes;
            if (latest_comments === undefined) {
                latest_comments = Array();
                owner_comments = latest_comments
                    .filter(item => item.user.username == arguments[0])
                    .map(item => item.text)
                    .reduce((item, total) => item + '\\n' + total, '');
                return owner_comments;}
            else {
                return null;}
            """,
            user_name,
        )

    if owner_comments == "":
        owner_comments = None

    # Append owner comments to description as it might contain further tags
    if image_text is None:
        image_text = owner_comments
    elif owner_comments:
        image_text = image_text + "\n" + owner_comments

    # If the image still has no description gets the first comment
    if image_text is None:
        if graphql:
            media_edge_string = get_media_edge_comment_string(media)
            image_text = media[media_edge_string]["edges"]
            image_text = image_text[0]["node"]["text"] if image_text else None
        else:
            image_text = media["comments"]["nodes"]
            image_text = image_text[0]["text"] if image_text else None
    if image_text is None:
        image_text = "No description"

    logger.info(
        "-Posted by: {} image likes: {} image comments: {}".format(
            user_name.encode("utf-8"), likes_count, comments_count))

    # Check if mandatory character set, before adding the location to the text
    if mandatory_language:
        if not check_character_set(image_text):
            return (
                True,
                user_name,
                likes_count,
                comments_count,
                posting_datetime_str,
                location_name,
                image_text,
                is_video,
                "Mandatory language not fulfilled",
                "Not mandatory language",
            )

    # Append location to image_text so we can search through both in one go
    if location_name:
        logger.info("-Location: {}".format(location_name.encode("utf-8")))
        image_text = image_text + "\n" + location_name

    if mandatory_words:
        if not any((word in image_text for word in mandatory_words)):
            return (
                True,
                user_name,
                likes_count,
                comments_count,
                posting_datetime_str,
                location_name,
                image_text,
                is_video,
                "Mandatory words not fulfilled",
                "Not mandatory likes",
            )

    # FIX: the original built `[x.lower() for x in image_text]` — a list of
    # single characters — so multi-character ignore words could never match.
    # A lowercase substring check is the evident intent.
    image_text_lower = image_text.lower()
    ignore_if_contains_lower = [x.lower() for x in ignore_if_contains]
    if any((word in image_text_lower for word in ignore_if_contains_lower)):
        return (False, user_name, likes_count, comments_count,
                posting_datetime_str, location_name, image_text, is_video,
                "None", "Pass")

    # Build one regex per dont_like entry; leading '#'/'['/']' select how the
    # word may be embedded inside a hashtag.
    dont_like_regex = []
    for dont_likes in dont_like:
        if dont_likes.startswith("#"):
            dont_like_regex.append(dont_likes + r"([^\d\w]|$)")
        elif dont_likes.startswith("["):
            dont_like_regex.append("#" + dont_likes[1:] + r"[\d\w]+([^\d\w]|$)")
        elif dont_likes.startswith("]"):
            dont_like_regex.append(r"#[\d\w]+" + dont_likes[1:] + r"([^\d\w]|$)")
        else:
            dont_like_regex.append(
                r"#[\d\w]*" + dont_likes + r"[\d\w]*([^\d\w]|$)")

    for dont_likes_regex in dont_like_regex:
        quash = re.search(dont_likes_regex, image_text, re.IGNORECASE)
        if quash:
            quashed = (
                (((quash.group(0)).split("#")[1]).split(" ")[0])
                .split("\n")[0]
                .encode("utf-8")
            )  # dismiss possible space and newlines
            iffy = (
                (re.split(r"\W+", dont_likes_regex))[3]
                if dont_likes_regex.endswith("*([^\\d\\w]|$)")
                else (re.split(r"\W+", dont_likes_regex))[1]  # 'word' without format
                if dont_likes_regex.endswith("+([^\\d\\w]|$)")
                else (re.split(r"\W+", dont_likes_regex))[3]  # '[word'
                if dont_likes_regex.startswith("#[\\d\\w]+")
                else (re.split(r"\W+", dont_likes_regex))[1]  # ']word'
            )  # '#word'
            inapp_unit = 'Inappropriate! ~ contains "{}"'.format(
                quashed if iffy == quashed else '" in "'.join([str(iffy), str(quashed)])
            )
            return (True, user_name, likes_count, comments_count,
                    posting_datetime_str, location_name, image_text, is_video,
                    inapp_unit, "Undesired word")

    return (False, user_name, likes_count, comments_count,
            posting_datetime_str, location_name, image_text, is_video,
            "None", "Success")
def get_links_for_username(
    browser,
    username,
    person,
    amount,
    logger,
    logfolder,
    randomize=False,
    media=None,
    taggedImages=False,
    imageToFind=None,
):
    """
    Fetches the number of links specified by amount and returns a list
    of post links scraped from `person`'s profile page.
    """
    # Normalise `media` into a list of acceptable media types.
    if media is None:
        # All known media types
        media = MEDIA_ALL_TYPES
    elif media == MEDIA_PHOTO:
        # Include posts with multiple images in it
        media = [MEDIA_PHOTO, MEDIA_CAROUSEL]
    else:
        # Make it an array to use it in the following part
        media = [media]

    logger.info("Getting {} image list...".format(person))
    user_link = "https://www.instagram.com/{}/".format(person)
    if taggedImages:
        user_link = user_link + "tagged/"

    # Check URL of the webpage, if it already is user's profile page,
    # then do not navigate to it again
    web_address_navigator(browser, user_link)

    if not is_page_available(browser, logger):
        logger.error(
            "Instagram error: The link you followed may be broken, or the "
            "page may have been removed..."
        )
        return False

    # if private user, we can get links only if we following
    following_status, _ = get_following_status(
        browser, "profile", username, person, None, logger, logfolder
    )
    is_private = is_private_profile(
        browser, logger, following_status == "Following")
    if (
        is_private is None
        or (is_private is True and following_status not in ["Following", True])
        or (following_status == "Blocked")
    ):
        logger.info("This user is private and we are not following")
        return False

    web_address_navigator(browser, user_link)

    # Get links
    links = []
    main_elem = browser.find_element_by_tag_name("article")
    posts_count = get_number_of_posts(browser)
    attempt = 0
    if posts_count is not None and amount > posts_count:
        logger.info(
            "You have requested to get {} posts from {}'s profile page BUT"
            " there only {} posts available :D".format(amount, person, posts_count)
        )
        amount = posts_count

    while len(links) < amount:
        links_before_scroll = links
        sleep(1.25)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # update server calls after a scroll request
        update_activity(browser, state=None)
        # using `extend` or `+=` results reference stay alive which affects
        # previous assignment (can use `copy()` for it)
        main_elem = browser.find_element_by_tag_name("article")
        new_links = get_links(browser, person, logger, media, main_elem)
        combined = links + new_links
        # Deduplicate while keeping first-seen order.
        links = sorted(set(combined), key=combined.index)
        if len(new_links) > 0 and (imageToFind is not None) and (imageToFind in new_links):
            break
        if len(links) == len(links_before_scroll):
            # No new links after this scroll: wait and retry, up to 60 times.
            logger.info(
                "Pausing 45s during scroll for new links. Currently at: {} of {} (attempt:{})".format(
                    len(links), amount, attempt))
            sleep(45.0)
            if attempt >= 60:
                logger.info(
                    "There are possibly less posts than {} in {}'s profile "
                    "page!".format(amount, person)
                )
                break
            attempt += 1
        else:
            attempt = 0

    if randomize is True:
        random.shuffle(links)
    return links[:amount]
def delta_followers(session, user_name, max_amount, past_followers=None):
    """
    Given an instagram username and an optional list of past_followers,
    retrieves the list of new followers.

    :param session: the session wrapping browser, logger and settings
    :param user_name: the account whose followers are fetched
    :param max_amount: upper bound of followers to retrieve
    :param past_followers: followers already known from a previous run
        (the original docstring mis-named this ``old_followers``)
    :return: list of follower usernames (empty on any failure)
    """
    if past_followers is None:
        past_followers = []
    session.quotient_breach = False
    try:
        user_name = user_name.strip()
        user_link = "https://www.instagram.com/{}/".format(user_name)
        web_address_navigator(session.browser, user_link)
        if not is_page_available(browser=session.browser, logger=session.logger):
            return []
        # check how many people are following this user.
        allfollowers, allfollowing = get_relationship_counts(
            browser=session.browser, username=user_name, logger=session.logger)
        # skip early for no followers
        if not allfollowers:
            session.logger.info("'{}' has no followers".format(user_name))
            return []
        elif allfollowers < max_amount:
            session.logger.warning(
                "'{}' has less followers- {}, than the given amount of {}".
                format(user_name, allfollowers, max_amount))
        # locate element to user's followers
        try:
            followers_link = session.browser.find_elements_by_xpath(
                '//a[@href="/{}/followers/"]'.format(user_name))
            if len(followers_link) > 0:
                click_element(session.browser, followers_link[0])
            else:
                session.logger.error("'{} is private'".format(user_name))
                return []
        except NoSuchElementException:
            session.logger.error(
                'Could not find followers\' link for {}'.format(user_name))
            return []
        except Exception as e:
            # FIX: was `except BaseException`, which would also swallow
            # KeyboardInterrupt/SystemExit; Exception covers the real
            # webdriver failures this guards against.
            session.logger.error("`followers_link` error {}".format(str(e)))
            return []
        person_list, _ = get_users_through_dialog(
            browser=session.browser,
            login=session.username,
            user_name=user_name,
            amount=max_amount,
            users_count=allfollowers,
            randomize=False,
            dont_include=[],
            blacklist=session.blacklist,
            follow_times=session.follow_times,
            simulation={
                "enabled": False,
                "percentage": 100
            },
            channel="Follow",
            jumps=session.jumps,
            logger=session.logger,
            logfolder=session.logfolder,
            past_followers=past_followers,
            wait_seconds=10,
        )
    except (TypeError, RuntimeWarning) as err:
        session.logger.error('Sorry, an error occurred: {}'.format(err))
        session.aborting = True
        return []
    return person_list