Example #1
def get_tags(browser, url):
    """Gets all the tags of the given description in the url"""

    # Check URL of the webpage, if it already is the one to be navigated,
    # then do not navigate to it again
    web_address_navigator(browser, url)

    try:
        browser.execute_script(
            "window.insta_data = window.__additionalData[Object.keys(window.__additionalData)[0]].data"
        )
    except WebDriverException:
        browser.execute_script(
            "window.insta_data = window._sharedData.entry_data.PostPage[0]"
        )

    graphql = browser.execute_script("return ('graphql' in window.insta_data)")

    if graphql:
        image_text = browser.execute_script(
            "return window.insta_data.graphql."
            "shortcode_media.edge_media_to_caption.edges[0].node.text"
        )

    else:
        image_text = browser.execute_script(
            "return window.insta_data.media.caption.text"
        )

    tags = findall(r"#\w*", image_text)

    return tags
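All of the examples here lean on web_address_navigator to avoid reloading a page the browser is already on, as the comment at the top of get_tags describes. A minimal sketch of such a helper, assuming it only needs to compare the current URL with the target and navigate when they differ (the project's real helper may also throttle requests or record activity):

from time import sleep

from selenium.common.exceptions import WebDriverException


def web_address_navigator(browser, link):
    """Navigate to `link` only if the browser is not already there"""
    try:
        current_url = browser.current_url
    except WebDriverException:
        current_url = None

    # skip the page load when the target URL is already open
    if current_url is None or current_url != link:
        browser.get(link)
        sleep(2)  # give the page a moment to render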
Example #2
def fetch_connection_ip(browser):
    try:
        # trust proxy string if a proxy is provided
        proxy_str = _session.proxy_string
        return re.search(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
                         proxy_str).group(0)
    except Exception:
        # fetch current ip from third-party website
        web_address_navigator(browser, ip_address_check_url)
        return browser.find_element_by_tag_name("pre").text
Example #3
def test_connection(browser):
    try:
        fetch_ip = fetch_connection_ip(browser)
        fetch_instagram_data = None

        if fetch_ip:
            web_address_navigator(browser, instagram_test_url)
            fetch_instagram_data = browser.page_source
        return {"ip": fetch_ip, "instagramResponse": fetch_instagram_data}
    except Exception:
        # re-raise with the original traceback intact
        raise
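A hedged usage sketch for the connection check above: call it with an already-created webdriver and inspect the returned dict (only the two keys come from the function itself; the rest is illustrative):

result = test_connection(browser)
print("outgoing IP:", result["ip"])
html = result["instagramResponse"] or ""
print("received {} bytes of Instagram HTML".format(len(html)))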
Example #4
def nf_go_to_follow_page(self, which: str, username: str):
    # TODO: do it naturally
    try:
        follow_which_button = self.browser.find_element_by_xpath(
            '//a[@href="/{}/{}/"]'.format(username, which))
        nf_scroll_into_view(self, follow_which_button)
        nf_click_center_of_element(self, follow_which_button)
    except NoSuchElementException:
        self.logger.warning("Failed to get {} page button".format(which))
    sleep(2)
    follow_link = "https://www.instagram.com/{}/{}/".format(username, which)
    if not check_if_in_correct_page(self, follow_link):
        self.logger.error(
            "Failed to go to {} page, navigating there".format(which))
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, follow_link)
Example #5
def nf_go_from_post_to_profile(self, username: str):
    try:
        username_button = self.browser.find_element_by_xpath(
            '/html/body/div[1]/section/main/div/div/article/header//div[@class="e1e1d"]'
        )
        nf_scroll_into_view(self, username_button)
        nf_click_center_of_element(self, username_button)
    except NoSuchElementException:
        self.logger.warning("Failed to get user page button")

    sleep(1)
    user_link = "https://www.instagram.com/{}/".format(username)
    if not check_if_in_correct_page(self, user_link):
        self.logger.error("Failed to go to user page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, user_link)
Example #6
def nf_go_to_user_page(self, username: str):
    """Navigates to the provided user page by typing its name on explore"""
    try:
        nf_type_on_explore(self, username)
        sleep(2)
        # click the username suggestion
        user_option = self.browser.find_element_by_xpath(
            '//a[@href="/{}/"]'.format(username))
        nf_click_center_of_element(self, user_option)
    except NoSuchElementException:
        self.logger.warning("Failed to go to get a page element")

    sleep(1)
    user_link = "https://www.instagram.com/{}/".format(username)
    if not check_if_in_correct_page(self, user_link):
        self.logger.error("Failed to go to user page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, user_link)
Example #7
def nf_go_to_tag_page(self, tag: str):
    """Navigates to the provided tag page by typing it on explore"""
    try:
        nf_type_on_explore(self, "#" + tag)
        sleep(2)
        # click tag
        tag_option = self.browser.find_element_by_xpath(
            '//a[@href="/explore/tags/{}/"]'.format(tag))
        # self.browser.execute_script("arguments[0].click();", tag_option)
        nf_click_center_of_element(self, tag_option)
    except NoSuchElementException:
        self.logger.warning("Failed to get a page element")

    sleep(1)
    tag_link = "https://www.instagram.com/explore/tags/{}/".format(tag)
    if not check_if_in_correct_page(self, tag_link):
        self.logger.error("Failed to go to tag page, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, tag_link)
Example #8
def nf_find_and_press_back(self, link: str):
    """Finds and press back button"""
    possibles = [
        '/html/body/div[1]/section/nav[1]/div/header//a[@class=" Iazdo"]',
        '/html/body/div[1]/section/nav[1]/div/header//a[@class="Iazdo"]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@class="_8-yf5 "]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@class="_8-yf5"]',
        '/html/body/div[1]/section/nav[1]/div/header//a//*[name()="svg"][@aria-label="Back"]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@class="_8-yf5 "]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@class="_8-yf5"]',
        '/html/body/div[1]/section/nav[1]/div/header//a/span/*[name()="svg"][@aria-label="Back"]',
    ]
    success = False
    back_path = ""
    for back_path in possibles:
        if not success:
            try:
                back = self.browser.find_element_by_xpath(back_path)
                nf_scroll_into_view(self, back)
                nf_click_center_of_element(self, back)
                self.browser.execute_script("arguments[0].click();", back)
                success = True
                break
            except NoSuchElementException:
                success = False
                # self.logger.warning("Failed to get back button with xpath:\n{}".format(back_path))

    if not success:
        self.logger.warning("Failed to get back button with all xpaths")
    else:
        self.logger.info(
            "Pressed back button with xpath:\n     {}".format(back_path))

    sleep(3)
    if not check_if_in_correct_page(self, link):
        self.logger.error("Failed to go back, navigating there")
        # TODO: retry to get there naturally
        web_address_navigator(self.browser, link)
Example #9
def db_store_comments(self, posts: List[Post], post_link: str):
    """Stores all comments of open post then goes back to post page"""
    try:
        comments_button = self.browser.find_elements_by_xpath(
            '//article//div[2]/div[1]//a[contains(@href,"comments")]')
        if comments_button:
            nf_scroll_into_view(self, comments_button[0])
            nf_click_center_of_element(self, comments_button[0])
            sleep(2)
            comments_link = post_link + 'comments/'
            if not check_if_in_correct_page(self, comments_link):
                self.logger.error(
                    "Failed to go to comments page, navigating there")
                # TODO: retry to get there naturally
                web_address_navigator(self.browser, comments_link)
            more_comments = self.browser.find_elements_by_xpath(
                '//span[@aria-label="Load more comments"]')
            counter = 1
            while more_comments and counter <= 10:
                self.logger.info("Loading comments ({}/10)...".format(counter))
                nf_scroll_into_view(self, more_comments[0])
                self.browser.execute_script("arguments[0].click();",
                                            more_comments[0])
                more_comments = self.browser.find_elements_by_xpath(
                    '//span[@aria-label="Load more comments"]')
                counter += 1

            comments = self.browser.find_elements_by_xpath(
                '/html/body/div[1]/section/main/div/ul/ul[@class="Mr508"]')
            for comment in comments:
                inner_container = comment.find_element_by_xpath(
                    './/div[@class="C4VMK"]')
                username = inner_container.find_element_by_xpath(
                    './/h3/div/a').text
                text, _ = deform_emojis(
                    inner_container.find_element_by_xpath('.//span').text)
                post_date = inner_container.find_element_by_xpath(
                    './/time').get_attribute('datetime')
                post_date = datetime.fromisoformat(post_date[:-1])

                user = db_get_or_create_user(self, username)
                self.db.session.add(user)
                self.db.session.commit()

                for post in posts:
                    comment = Comment(
                        date_posted=post_date,
                        text=text,
                        user=user,
                        post=post,
                    )
                    self.db.session.add(comment)
                    self.db.session.commit()
        else:
            self.logger.error("No comments found")
    except SQLAlchemyError:
        self.db.session.rollback()
        raise
    finally:
        self.db.session.commit()
        nf_find_and_press_back(self, post_link)
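The Post, Comment, and User objects above behave like SQLAlchemy models attached to self.db.session. Purely as an assumption to make the session.add(...) calls concrete, a Comment model along these lines would accept the keyword arguments used in the snippet:

from datetime import datetime

from sqlalchemy import Column, DateTime, ForeignKey, Integer, Text
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()


class Comment(Base):
    __tablename__ = "comments"

    id = Column(Integer, primary_key=True)
    date_posted = Column(DateTime, default=datetime.utcnow)
    text = Column(Text)
    user_id = Column(Integer, ForeignKey("users.id"))
    post_id = Column(Integer, ForeignKey("posts.id"))

    user = relationship("User", backref="comments")
    post = relationship("Post", backref="comments")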
Example #10
def check_link2(
    browser,
    post_link,
    dont_like,
    mandatory_words,
    mandatory_language,
    mandatory_character,
    is_mandatory_character,
    check_character_set,
    ignore_if_contains,
    logger,
):
    """
    Check the given link if it is appropriate

    :param browser: The selenium webdriver instance
    :param post_link:
    :param dont_like: hashtags of inappropriate phrases
    :param mandatory_words: words of appropriate phrases
    :param ignore_if_contains:
    :param logger: the logger instance
    :return: tuple of
        boolean: True if inappropriate,
        string: the username,
        integer: number of likes,
        integer: number of comments,
        posting_date_str: string,
        location_name: string,
        image_text: string,
        boolean: True if it is video media,
        string: the message if inappropriate else 'None',
        string: set the scope of the return value
    """

    # Check URL of the webpage, if it already is post's page, then do not
    # navigate to it again

    web_address_navigator(browser, post_link)

    # Check if the Post is Valid/Exists
    try:
        post_page = browser.execute_script(
            "return window.__additionalData[Object.keys(window.__additionalData)[0]].data"
        )

    except WebDriverException:  # handle the possible `entry_data` error
        try:
            browser.execute_script("location.reload()")
            update_activity(browser, state=None)

            post_page = browser.execute_script(
                "return window._sharedData.entry_data.PostPage[0]"
            )

        except WebDriverException:
            post_page = None

    if post_page is None:
        logger.warning("Unavailable Page: {}".format(post_link.encode("utf-8")))
        return (
            True,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            "Unavailable Page",
            "Failure",
        )

    web_address_navigator(browser, post_link)
    likes_count = get_likes(browser, logger)

    try:
        comments_count, comments_status = get_comments_count(browser, logger)
    except Exception:
        # get_comments_count failed, so neither value was ever assigned
        comments_count = None
        comments_status = None

    time_element = browser.find_element_by_xpath("//div/a/time")
    posting_datetime_str = time_element.get_attribute("datetime")

    # Gets the description of the post's link and checks for the dont_like tags
    graphql = "graphql" in post_page
    if graphql:
        media = post_page["graphql"]["shortcode_media"]
        is_video = media["is_video"]
        user_name = media["owner"]["username"]
        image_text = media["edge_media_to_caption"]["edges"]
        image_text = image_text[0]["node"]["text"] if image_text else None
        location = media["location"]
        location_name = location["name"] if location else None
        media_edge_string = get_media_edge_comment_string(media)
        # double {{ allows us to call .format here:
        try:
            browser.execute_script(
                "window.insta_data = window.__additionalData[Object.keys(window.__additionalData)[0]].data"
            )
        except WebDriverException:
            browser.execute_script(
                "window.insta_data = window._sharedData.entry_data.PostPage[0]"
            )
        owner_comments = browser.execute_script(
            """
            latest_comments = window.insta_data.graphql.shortcode_media.{}.edges;
            if (latest_comments === undefined) {{
                latest_comments = Array();
                owner_comments = latest_comments
                    .filter(item => item.node.owner.username == arguments[0])
                    .map(item => item.node.text)
                    .reduce((item, total) => item + '\\n' + total, '');
                return owner_comments;}}
            else {{
                return null;}}
        """.format(
                media_edge_string
            ),
            user_name,
        )

    else:
        media = post_page[0]["shortcode_media"]
        is_video = media["is_video"]
        user_name = media["owner"]["username"]
        image_text = media["caption"]
        # the legacy payload carries no location data
        location_name = None
        owner_comments = browser.execute_script(
            """
            latest_comments = window._sharedData.entry_data.PostPage[
            0].media.comments.nodes;
            if (latest_comments === undefined) {
                latest_comments = Array();
                owner_comments = latest_comments
                    .filter(item => item.user.username == arguments[0])
                    .map(item => item.text)
                    .reduce((item, total) => item + '\\n' + total, '');
                return owner_comments;}
            else {
                return null;}
        """,
            user_name,
        )

    if owner_comments == "":
        owner_comments = None

    # Append owner comments to description as it might contain further tags
    if image_text is None:
        image_text = owner_comments

    elif owner_comments:
        image_text = image_text + "\n" + owner_comments

    # If the image still has no description gets the first comment
    if image_text is None:
        if graphql:
            media_edge_string = get_media_edge_comment_string(media)
            image_text = media[media_edge_string]["edges"]
            image_text = image_text[0]["node"]["text"] if image_text else None

        else:
            image_text = media["comments"]["nodes"]
            image_text = image_text[0]["text"] if image_text else None

    if image_text is None:
        image_text = "No description"

    logger.info("-Posted by: {}  image likes: {}  image comments: {}".format(user_name.encode("utf-8"),likes_count,comments_count))
#    logger.info("-Link: {}".format(post_link.encode("utf-8")))
#    logger.info("Description: {}".format(image_text.encode("utf-8")))
#    logger.info("-Posted date: {}:".format(posting_datetime_str))
#    logger.info("-Likes: {}".format(likes_count))
#    logger.info("-Comments: {}".format(comments_count))

    # Check if mandatory character set, before adding the location to the text
    if mandatory_language:
        if not check_character_set(image_text):
            return (
                True,
                user_name,
                likes_count,
                comments_count,
                posting_datetime_str,
                location_name,
                image_text,
                is_video,
                "Mandatory language not " "fulfilled",
                "Not mandatory " "language",
            )

    # Append location to image_text so we can search through both in one go
    if location_name:
        logger.info("-Location: {}".format(location_name.encode("utf-8")))
        image_text = image_text + "\n" + location_name

    if mandatory_words:
        if not any((word in image_text for word in mandatory_words)):
            return (
                True,
                user_name,
                likes_count,
                comments_count,
                posting_datetime_str,
                location_name,
                image_text,
                is_video,
                "Mandatory words not " "fulfilled",
                "Not mandatory " "likes",
            )

    # lowercase both sides so the whitelist check is a case-insensitive
    # substring match against the whole description
    image_text_lower = image_text.lower()
    ignore_if_contains_lower = [x.lower() for x in ignore_if_contains]
    if any(word in image_text_lower for word in ignore_if_contains_lower):
        return (
            False,
            user_name,
            likes_count,
            comments_count,
            posting_datetime_str,
            location_name,
            image_text,
            is_video,
            "None",
            "Pass",
        )

    dont_like_regex = []

    for dont_likes in dont_like:
        if dont_likes.startswith("#"):
            dont_like_regex.append(dont_likes + r"([^\d\w]|$)")
        elif dont_likes.startswith("["):
            dont_like_regex.append("#" + dont_likes[1:] + r"[\d\w]+([^\d\w]|$)")
        elif dont_likes.startswith("]"):
            dont_like_regex.append(r"#[\d\w]+" + dont_likes[1:] + r"([^\d\w]|$)")
        else:
            dont_like_regex.append(r"#[\d\w]*" + dont_likes + r"[\d\w]*([^\d\w]|$)")

    for dont_likes_regex in dont_like_regex:
        quash = re.search(dont_likes_regex, image_text, re.IGNORECASE)
        if quash:
            quashed = (
                (((quash.group(0)).split("#")[1]).split(" ")[0])
                .split("\n")[0]
                .encode("utf-8")
            )  # dismiss possible space and newlines
            iffy = (
                (re.split(r"\W+", dont_likes_regex))[3]
                if dont_likes_regex.endswith("*([^\\d\\w]|$)")
                else (re.split(r"\W+", dont_likes_regex))[1]  # 'word' without format
                if dont_likes_regex.endswith("+([^\\d\\w]|$)")
                else (re.split(r"\W+", dont_likes_regex))[3]  # '[word'
                if dont_likes_regex.startswith("#[\\d\\w]+")
                else (re.split(r"\W+", dont_likes_regex))[1]  # ']word'
            )  # '#word'
            inapp_unit = 'Inappropriate! ~ contains "{}"'.format(
                quashed if iffy == quashed else '" in "'.join([str(iffy), str(quashed)])
            )
            return (
                True,
                user_name,
                likes_count,
                comments_count,
                posting_datetime_str,
                location_name,
                image_text,
                is_video,
                inapp_unit,
                "Undesired word",
            )

    return (
        False,
        user_name,
        likes_count,
        comments_count,
        posting_datetime_str,
        location_name,
        image_text,
        is_video,
        "None",
        "Success",
    )
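The dont_like prefixes handled in the loop above are easy to misread: a leading "#" blocks the exact hashtag, "[" blocks hashtags that start with the word, "]" blocks hashtags that end with it, and a bare word blocks any hashtag containing it. A standalone sketch of just that translation step, with illustrative names that are not part of the original module:

import re


def build_dont_like_regex(dont_like):
    """Translate dont_like entries into the patterns used by check_link2"""
    patterns = []
    for entry in dont_like:
        if entry.startswith("#"):
            # "#food" -> exactly the hashtag #food
            patterns.append(entry + r"([^\d\w]|$)")
        elif entry.startswith("["):
            # "[food" -> hashtags starting with "food", e.g. #foodporn
            patterns.append("#" + entry[1:] + r"[\d\w]+([^\d\w]|$)")
        elif entry.startswith("]"):
            # "]food" -> hashtags ending with "food", e.g. #dogfood
            patterns.append(r"#[\d\w]+" + entry[1:] + r"([^\d\w]|$)")
        else:
            # "food" -> hashtags containing "food", e.g. #superfoods
            patterns.append(r"#[\d\w]*" + entry + r"[\d\w]*([^\d\w]|$)")
    return patterns


caption = "brunch time #foodporn #sunday"
for pattern in build_dont_like_regex(["#sunday", "[food"]):
    match = re.search(pattern, caption, re.IGNORECASE)
    if match:
        print("blocked by {} -> {}".format(pattern, match.group(0).strip()))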
Example #11
def get_links_for_username(
    browser,
    username,
    person,
    amount,
    logger,
    logfolder,
    randomize=False,
    media=None,
    taggedImages=False,
    imageToFind=None
):
    """
    Fetches the number of links specified by amount and returns a list of links
    """

    if media is None:
        # All known media types
        media = MEDIA_ALL_TYPES
    elif media == MEDIA_PHOTO:
        # Include posts with multiple images in it
        media = [MEDIA_PHOTO, MEDIA_CAROUSEL]
    else:
        # Make it an array to use it in the following part
        media = [media]

    logger.info("Getting {} image list...".format(person))

    user_link = "https://www.instagram.com/{}/".format(person)
    if taggedImages:
        user_link = user_link + "tagged/"

    # Check URL of the webpage, if it already is user's profile page,
    # then do not navigate to it again

    web_address_navigator(browser, user_link)

    if not is_page_available(browser, logger):
        logger.error(
            "Instagram error: The link you followed may be broken, or the "
            "page may have been removed..."
        )
        return False

    # if private user, we can get links only if we following
    following_status, _ = get_following_status(
        browser, "profile", username, person, None, logger, logfolder
    )

    # if following_status is None:
    #    browser.wait_for_valid_connection(browser, username, logger)

    # if following_status == 'Follow':
    #    browser.wait_for_valid_authorization(browser, username, logger)

    is_private = is_private_profile(browser, logger, following_status == "Following")
    if (
        is_private is None
        or (is_private is True and following_status not in ["Following", True])
        or (following_status == "Blocked")
    ):
        logger.info("This user is private and we are not following")
        return False

    web_address_navigator(browser, user_link)

    # Get links
    links = []
    main_elem = browser.find_element_by_tag_name("article")
    posts_count = get_number_of_posts(browser)
    attempt = 0

    if posts_count is not None and amount > posts_count:
        logger.info(
            "You have requested to get {} posts from {}'s profile page BUT"
            " there only {} posts available :D".format(amount, person, posts_count)
        )
        amount = posts_count

    while len(links) < amount:
        initial_links = links
        sleep(1.25)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # update server calls after a scroll request
        update_activity(browser, state=None)

        # `extend` or `+=` would mutate the existing list in place, so
        # `initial_links` (which references the same object) would change too;
        # building a new list with `+` (or using `copy()`) avoids that
        main_elem = browser.find_element_by_tag_name("article")

        tempLinks = get_links(browser, person, logger, media, main_elem)
        links = links + tempLinks
        links = sorted(set(links), key=links.index)

        if len(tempLinks) > 0 and imageToFind is not None and imageToFind in tempLinks:
            break

        if len(links) == len(initial_links):
            logger.info("Pausing 45s during scroll for new links.  Currently at: {} of {} (attempt:{})".format(len(links),amount,attempt))
            sleep(45.0)
            if attempt >= 60:
                logger.info(
                    "There are possibly less posts than {} in {}'s profile "
                    "page!".format(amount, person)
                )
                break
            else:
                attempt += 1
        else:
            attempt = 0

    if randomize is True:
        random.shuffle(links)

    return links[:amount]
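A sketch of how the fetcher above might be driven; the session object and the target account name are assumptions, and "Photo" stands in for whatever media constant the surrounding project defines:

links = get_links_for_username(
    session.browser,
    session.username,
    "some_target_account",
    amount=25,
    logger=session.logger,
    logfolder=session.logfolder,
    randomize=True,
    media="Photo",
)
if links:
    for link in links[:5]:
        session.logger.info("queued post: {}".format(link))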
Example #12
def delta_followers(session, user_name, max_amount, past_followers=None):
    """
    Given an instagram username and an optional list of past_followers, retrieves the list of new followers.
    :param session:
    :param user_name:
    :param max_amount:
    :param old_followers:
    :return:
    """
    if past_followers is None:
        past_followers = []

    session.quotient_breach = False

    try:

        user_name = user_name.strip()

        user_link = "https://www.instagram.com/{}/".format(user_name)
        web_address_navigator(session.browser, user_link)

        if not is_page_available(browser=session.browser,
                                 logger=session.logger):
            return []

        # check how many people are following this user.
        allfollowers, allfollowing = get_relationship_counts(
            browser=session.browser, username=user_name, logger=session.logger)
        # print(allfollowers)
        # print(allfollowing)

        # skip early for no followers
        if not allfollowers:
            session.logger.info("'{}' has no followers".format(user_name))
            return []

        elif allfollowers < max_amount:
            session.logger.warning(
                "'{}' has fewer followers ({}) than the requested amount of {}".
                format(user_name, allfollowers, max_amount))

        # locate element to user's followers
        try:
            followers_link = session.browser.find_elements_by_xpath(
                '//a[@href="/{}/followers/"]'.format(user_name))
            if len(followers_link) > 0:
                click_element(session.browser, followers_link[0])
            else:
                session.logger.error("'{} is private'".format(user_name))
                return []
        except NoSuchElementException:
            session.logger.error(
                'Could not find followers\' link for {}'.format(user_name))
            return []

        except BaseException as e:
            session.logger.error("`followers_link` error {}".format(str(e)))
            return []

        person_list, _ = get_users_through_dialog(
            browser=session.browser,
            login=session.username,
            user_name=user_name,
            amount=max_amount,
            users_count=allfollowers,
            randomize=False,
            dont_include=[],
            blacklist=session.blacklist,
            follow_times=session.follow_times,
            simulation={
                "enabled": False,
                "percentage": 100
            },
            channel="Follow",
            jumps=session.jumps,
            logger=session.logger,
            logfolder=session.logfolder,
            past_followers=past_followers,
            wait_seconds=10,
        )

    except (TypeError, RuntimeWarning) as err:
        session.logger.error('Sorry, an error occurred: {}'.format(err))
        session.aborting = True
        return []

    return person_list
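A sketch of how delta_followers might be polled so that only followers gained since the previous pass come back; the polling cadence and the session object are assumptions:

from time import sleep

seen_followers = []
for _ in range(3):
    new_followers = delta_followers(
        session, "some_target_account", max_amount=100,
        past_followers=seen_followers)
    if new_followers:
        session.logger.info(
            "gained {} new followers".format(len(new_followers)))
        seen_followers.extend(new_followers)
    sleep(60 * 10)  # wait ten minutes between passes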