def extract_information(browser, username, should_extract_followers,
                        limit_amount):
    """Get all the information for the given username.

    Navigates to the profile page, reads the profile info, scrapes posts
    (when enabled in Settings), and builds a frequency-ordered list of
    commenters.

    Args:
        browser: selenium webdriver used for navigation and scraping.
        username: Instagram handle to scrape (without '@').
        should_extract_followers: forwarded to get_user_info.
        limit_amount: maximum number of posts to scrape; values < 1 mean
            "no limit" (capped at 999999).

    Returns:
        Tuple of (userinfo dict, list of unique commenter usernames ordered
        by comment frequency, excluding the profile owner).

    Raises:
        NoInstaProfilePageFound: if the profile page returns a 404.
    """
    InstaLogger.logger().info('Extracting information from ' + username)
    # NOTE(review): isprivate is never updated here — presumably set from
    # profile data in an earlier revision; verify before relying on it.
    isprivate = False
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999

    try:
        userinfo = get_user_info(browser, username, should_extract_followers)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, userinfo['num_of_posts'])
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    post_infos = []
    user_commented_total_list = []
    if Settings.scrape_posts_infos is True and isprivate is False:
        try:
            post_infos, user_commented_total_list = extract_user_posts(
                browser, num_of_posts_to_do)
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still work.
            InstaLogger.logger().error("Couldn't get user posts.")

    userinfo['posts'] = post_infos
    userinfo['scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    InstaLogger.logger().info("User " + username + " has " +
                              str(len(user_commented_total_list)) +
                              " comments.")

    # Sort commenters by frequency so the most frequent are first;
    # itemgetter(1, 0) breaks count ties by username, both descending.
    import collections
    from operator import itemgetter
    counter = collections.Counter(user_commented_total_list)
    com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    com = map(lambda x: [x[0]] * x[1], com)
    user_commented_total_list = [item for sublist in com for item in sublist]

    # Remove consecutive duplicates, preserving the frequency order
    # (that's why set() is not used), and skip the profile owner.
    user_commented_list = []
    last = ''
    for commenter in user_commented_total_list:
        if username.lower() != commenter:
            if last != commenter:
                user_commented_list.append(commenter)
            last = commenter
    # BUG FIX: the original reset user_commented_list to [] right here,
    # so callers always received an empty commenter list.

    return userinfo, user_commented_list
# Example #2
def extract_information(browser, username):
    """Fetch only the profile information for the given username.

    Lightweight variant of the full extractor: navigates to the profile
    page and returns the result of get_user_info, without scraping posts.

    Args:
        browser: selenium webdriver used for navigation.
        username: Instagram handle to scrape (without '@').

    Returns:
        The userinfo dict produced by get_user_info.

    Raises:
        NoInstaProfilePageFound: if the profile page returns a 404.
    """
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    try:
        userinfo = get_user_info(browser, username)
    except Exception as err:
        # BUG FIX: the original quit() silently — log why we terminate.
        InstaLogger.logger().error(
            "Couldn't get user profile. - Terminating: {}".format(err))
        quit()

    return userinfo
# Example #3
def extract_followers(browser, username):
    """Scrape the follower usernames of the given profile.

    Opens the followers dialog and repeatedly scrolls it, harvesting 12
    entries at a time and deleting them from the DOM so the list element
    stays small. Stops when no new batch loads within 10 seconds.

    Args:
        browser: selenium webdriver, already logged in.
        username: Instagram handle whose followers are collected.

    Returns:
        List of follower usernames (may contain leftovers harvested from
        the final partial batch).

    Raises:
        NoInstaProfilePageFound: if the profile page returns a 404.

    NOTE(review): the CSS class names ('isgrP', 'PZuss', ...) are
    obfuscated Instagram classes and break whenever Instagram redeploys.
    """
    InstaLogger.logger().info('Extracting follower from ' + username)
    try:
        user_link = "https://www.instagram.com/{}".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)
    sleep(5)

    followers = []

    # Open the followers dialog (second <li> in the profile header stats).
    elem = browser.find_element_by_xpath(
        "//div[@id='react-root']//header[@class='vtbgv ']//ul[@class='k9GMp ']/child::li[2]/a/span"
    )
    elem.click()
    sleep(15)

    # Scroll past the suggestion list so 24 real entries load after this.
    browser.execute_script(
        "document.getElementsByClassName('isgrP')[0].scrollTo(0,500)")
    sleep(10)

    # Harvest the first batch of 12 usernames ...
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']")
    for i in range(12):
        followers.append(elems[i].get_attribute('innerHTML'))

    # ... then delete them from the DOM to keep the list short.
    for i in range(12):
        browser.execute_script(
            "document.getElementsByClassName('PZuss')[0].children[0].remove()")

    isDone = False

    while 1:
        try:
            start = time()
            # Scroll to the bottom of the dialog to trigger loading.
            browser.execute_script(
                "document.getElementsByClassName('isgrP')[0].scrollTo(0,document.getElementsByClassName('isgrP')[0].scrollHeight)"
            )

            # Wait until the next batch (12 new + 12 kept = 24 children)
            # has loaded, or give up after 10 seconds.
            while 1:
                # BUG FIX: the timeout is now checked on EVERY iteration.
                # The original checked it only after a successful poll, so
                # a persistently-raising execute_script (dialog closed,
                # class renamed) spun this loop forever.
                if time() - start > 10:
                    isDone = True
                    break
                try:
                    count = int(
                        browser.execute_script(
                            "return document.getElementsByClassName('PZuss')[0].children.length"
                        ))
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception:
                    continue
                if count == 24:
                    break

            if isDone:
                break

            # Harvest the 12 oldest entries of this batch.
            elems = browser.find_elements_by_xpath(
                "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']"
            )
            for i in range(12):
                followers.append(elems[i].get_attribute('innerHTML'))

            # Delete them so the list element stays at 12 children.
            for i in range(12):
                browser.execute_script(
                    "document.getElementsByClassName('PZuss')[0].children[0].remove()"
                )

            InstaLogger.logger().info(time() - start)

        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            continue

    # Collect whatever partial batch is still left in the dialog.
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']")
    for elem in elems:
        followers.append(elem.get_attribute('innerHTML'))

    return followers
def extract_user_posts_links(browser, username, limit_amount):
    """Collect post links (and preview image URLs) from a user's profile.

    Scrolls the profile grid until either limit_amount post links are
    indexed or four consecutive scrolls load nothing new.

    Args:
        browser: selenium webdriver used for navigation and scraping.
        username: Instagram handle to scrape (without '@').
        limit_amount: maximum number of post links to collect; values < 1
            mean "no limit" (capped at 999999).

    Returns:
        Tuple of (user_info dict, {post_url: scrape_index} dict preserving
        discovery order, {post_url: preview_image_src} dict).

    Raises:
        NoInstaProfilePageFound: if the profile page returns a 404.
    """
    InstaLogger.logger().info('Extracting information from ' + username)

    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999

    user_info = {}

    try:
        user_info = get_user_info(browser, username)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, user_info['num_of_posts'])
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    # {post_url: index} — dict doubles as an ordered, de-duplicated set.
    indexed_links = dict()
    preview_images = {}

    try:
        body_elem = browser.find_element_by_tag_name('body')

        previouslen = 0
        breaking = 0

        print("number of posts to do: ", num_of_posts_to_do)
        # The grid loads rows of 3, 12 posts per scroll page.
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll, "posts but checking ",
            num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while len(indexed_links) < num_of_posts_to_do:

            # PERF/BUG FIX: single guarded pass over every anchor in
            # <main>. The original read each href twice (once to build a
            # flat links list, again in a preview-image loop), and the
            # first read was unguarded — one stale element aborted the
            # whole scrape via the outer except.
            for main_div in browser.find_elements_by_tag_name('main'):
                for link_elem in main_div.find_elements_by_tag_name('a'):
                    try:
                        href = link_elem.get_attribute('href')
                        if "/p/" not in href:
                            continue
                        try:
                            img = link_elem.find_element_by_tag_name('img')
                            preview_images[href] = img.get_attribute('src')
                        except NoSuchElementException:
                            # Post tile without an <img> — still index it.
                            print("img exception 132")
                        if (len(indexed_links) < num_of_posts_to_do
                                and href not in indexed_links):
                            indexed_links[href] = len(indexed_links)
                    except Exception as err:
                        print(err)

            print("Scrolling profile ", len(indexed_links), "/",
                  num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep_time = Settings.sleep_time_between_post_scroll
            sleep(random.uniform(sleep_time - 1, sleep_time + 1))

            # Remove the part below to never break the scrolling script
            # before reaching num_of_posts: stop after 4 consecutive
            # scrolls that load no new posts.
            if len(indexed_links) == previouslen:
                breaking += 1
                print(
                    "breaking in ", 4 - breaking,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(indexed_links)

    except NoSuchElementException as err:
        InstaLogger.logger().error('Something went terribly wrong')

    return user_info, indexed_links, preview_images