Exemple #1
0
def _extract_post_info(proxy_browser, post_link):
    """Extract the details of a single post and return them as a nested dict.

    Delegates to ``extract_post_info`` and repackages the 17 values it
    returns: the location fields are grouped under ``'location'``, and the
    like/comment counts are paired with their lists under ``'likes'`` and
    ``'comments'``.

    Args:
        proxy_browser: Browser object, passed through unchanged to
            ``extract_post_info``.
        post_link: Link of the post to extract; also stored under ``'url'``.

    Returns:
        A dict with the keys ``caption``, ``location``, ``imgs``,
        ``imgdesc``, ``date``, ``tags``, ``likes``, ``views``, ``url``,
        ``comments`` and ``mentions``, or ``None`` when extraction raised an
        exception (the failure is logged, never propagated).
    """
    try:
        (caption, location_url, location_name, location_id, lat, lng, imgs,
         img_desc, tags, likes, comments_count, date, _user_commented_list,
         user_comments, mentions, user_liked_post,
         views) = extract_post_info(proxy_browser, post_link)
    except Exception as err:
        # Best-effort extraction: a failing post must not abort the caller.
        # One handler is enough (Selenium's NoSuchElementException is an
        # Exception subclass), and the cause is always logged so that
        # unexpected failures can be diagnosed, not just missing elements.
        InstaLogger.logger().error("Could not get information from post: " +
                                   post_link)
        InstaLogger.logger().error(err)
        return None

    location = {
        'location_url': location_url,
        'location_name': location_name,
        'location_id': location_id,
        'latitude': lat,
        'longitude': lng,
    }

    return {
        'caption': caption,
        'location': location,
        'imgs': imgs,
        'imgdesc': img_desc,
        'date': date,
        'tags': tags,
        'likes': {
            'count': likes,
            'list': user_liked_post
        },
        'views': views,
        'url': post_link,
        'comments': {
            'count': comments_count,
            'list': user_comments
        },
        'mentions': mentions
    }
Exemple #2
0
def extract_user_posts(browser, num_of_posts_to_do):
    """Collect post links from the current profile page and extract each post.

    The page is scrolled repeatedly (END key on ``<body>``) and every anchor
    inside a ``<main>`` element whose ``href`` contains ``"/p/"`` is recorded
    as a post link, until ``num_of_posts_to_do`` unique links are known or
    four consecutive scrolls bring no new link. Each collected link is then
    passed to ``extract_post_info``.

    Args:
        browser: Selenium-style driver positioned on the profile page.
        num_of_posts_to_do: Maximum number of posts to collect and extract.

    Returns:
        A tuple ``(post_infos, user_commented_total_list)``. ``post_infos``
        is a list with one dict per successfully extracted post, in the order
        the links were discovered. ``user_commented_total_list`` is the
        concatenation of the commenter lists returned for those posts.
        Posts whose extraction fails are logged and skipped.
    """
    # Unique post links in discovery order; the set gives O(1) duplicate
    # checks. Deduplicating *before* applying the limit matters: the view
    # re-lists links that are already known, and counting those against the
    # limit would leave no room for new ones and stall the collection.
    post_links = []
    seen_links = set()
    # Maps a post link to the src of its preview image.
    preview_imgs = {}

    try:
        body_elem = browser.find_element_by_tag_name('body')

        previouslen = 0
        breaking = 0

        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll, "posts but checking ",
            num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while len(post_links) < num_of_posts_to_do:

            for main_elem in browser.find_elements_by_tag_name('main'):
                for link_elem in main_elem.find_elements_by_tag_name('a'):
                    # Read the attribute once per anchor; every call is a
                    # round trip to the browser.
                    href = link_elem.get_attribute('href')
                    # Anchors without an href yield None; skip them and
                    # anything that is not a post link.
                    if not href or "/p/" not in href:
                        continue

                    try:
                        img = link_elem.find_element_by_tag_name('img')
                        preview_imgs[href] = img.get_attribute('src')
                    except NoSuchElementException:
                        print("img exception 132")
                    except Exception as err:
                        # A missing preview must not prevent the link from
                        # being collected.
                        print(err)

                    if (href not in seen_links
                            and len(post_links) < num_of_posts_to_do):
                        seen_links.add(href)
                        post_links.append(href)

            print("Scrolling profile ", len(post_links), "/",
                  num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)

            ##remove bellow part to never break the scrolling script before reaching the num_of_posts
            if len(post_links) == previouslen:
                breaking += 1
                print(
                    "breaking in ", 4 - breaking,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(post_links)
            ##

    except NoSuchElementException as err:
        InstaLogger.logger().error('Something went terribly wrong')
        InstaLogger.logger().error(err)

    post_infos = []
    # All username links of users who commented on any post of this user.
    user_commented_total_list = []

    for counter, postlink in enumerate(post_links, start=1):

        print("\n", counter, "/", len(post_links))

        try:
            (caption, location_url, location_name, location_id, lat, lng,
             imgs, imgdesc, tags, likes, commentscount, date,
             user_commented_list, user_comments, mentions, user_liked_post,
             views) = extract_post_info(browser, postlink)

            location = {
                'location_url': location_url,
                'location_name': location_name,
                'location_id': location_id,
                'latitude': lat,
                'longitude': lng,
            }

            post_infos.append({
                'caption': caption,
                'location': location,
                'imgs': imgs,
                'imgdesc': imgdesc,
                'preview_img': preview_imgs.get(postlink, None),
                'date': date,
                'tags': tags,
                'likes': {
                    'count': likes,
                    'list': user_liked_post
                },
                'views': views,
                'url': postlink,
                'comments': {
                    'count': commentscount,
                    'list': user_comments
                },
                'mentions': mentions
            })
            user_commented_total_list.extend(user_commented_list)
        except Exception as err:
            # Catch Exception rather than using a bare ``except`` so that
            # Ctrl-C / SystemExit still stop the crawl, and always log the
            # cause of the failure.
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
            InstaLogger.logger().error(err)
    return post_infos, user_commented_total_list