def _extract_post_info(proxy_browser, post_link):
    """Extract the details of a single post and return them as a dict.

    Calls ``extract_post_info(proxy_browser, post_link)`` and repackages the
    17-value tuple it returns into a nested dictionary.

    Args:
        proxy_browser: Browser/driver object forwarded unchanged to
            ``extract_post_info``.
        post_link: URL of the post; stored under ``'url'`` and used in the
            error log message.

    Returns:
        A dict with the keys ``caption``, ``location``, ``imgs``, ``imgdesc``,
        ``date``, ``tags``, ``likes``, ``views``, ``url``, ``comments`` and
        ``mentions``, or ``None`` when extraction raised an exception.
    """
    try:
        (caption, location_url, location_name, location_id, lat, lng, imgs,
         img_desc, tags, likes, comments_count, date, _user_commented_list,
         user_comments, mentions, user_liked_post,
         views) = extract_post_info(proxy_browser, post_link)
    except Exception as err:
        # Best effort: a failing post must not abort the caller. Always log
        # the exception itself so that failures other than a missing element
        # can be diagnosed too (previously only the fixed message was logged
        # for them).
        InstaLogger.logger().error(
            "Could not get information from post: " + post_link)
        InstaLogger.logger().error(err)
        return None

    location = {
        'location_url': location_url,
        'location_name': location_name,
        'location_id': location_id,
        'latitude': lat,
        'longitude': lng,
    }
    return {
        'caption': caption,
        'location': location,
        'imgs': imgs,
        'imgdesc': img_desc,
        'date': date,
        'tags': tags,
        'likes': {
            'count': likes,
            'list': user_liked_post
        },
        'views': views,
        'url': post_link,
        'comments': {
            'count': comments_count,
            'list': user_comments
        },
        'mentions': mentions
    }
def extract_user_posts(browser, num_of_posts_to_do):
    """Get all posts from user.

    Scrolls the currently loaded profile page, collecting links that contain
    ``/p/`` until ``num_of_posts_to_do`` unique links are found or four
    consecutive scrolls yield nothing new. Each collected link is then passed
    to ``extract_post_info``.

    Args:
        browser: Selenium-style driver exposing ``find_element_by_tag_name``
            and ``find_elements_by_tag_name``.
        num_of_posts_to_do: Maximum number of posts to collect and inspect.

    Returns:
        A tuple ``(post_infos, user_commented_total_list)``: a list with one
        dict per successfully extracted post (in page order), and the
        concatenation of the ``user_commented_list`` values returned by
        ``extract_post_info`` for those posts.
    """
    # post_links keeps unique post URLs in the order they appear on the page;
    # seen_links gives O(1) duplicate detection for it.
    post_links = []
    seen_links = set()
    # Maps a post URL to the src of its preview image on the profile grid.
    preview_imgs = {}

    try:
        body_elem = browser.find_element_by_tag_name('body')

        previous_len = 0
        stalled_rounds = 0
        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll,
            "posts but checking ", num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while len(post_links) < num_of_posts_to_do:
            for main_elem in browser.find_elements_by_tag_name('main'):
                for link_elem in main_elem.find_elements_by_tag_name('a'):
                    href = link_elem.get_attribute('href')
                    # Anchors without an href yield None; skip them and
                    # anything that is not a post link.
                    if not href or "/p/" not in href:
                        continue
                    try:
                        img = link_elem.find_element_by_tag_name('img')
                        preview_imgs[href] = img.get_attribute('src')
                    except NoSuchElementException:
                        print("img exception 132")
                    except Exception as err:
                        print(err)
                    # De-duplicate *before* counting towards the limit, so
                    # links still visible from an earlier scroll do not use
                    # up the remaining slots.
                    if (href not in seen_links
                            and len(post_links) < num_of_posts_to_do):
                        seen_links.add(href)
                        post_links.append(href)

            print("Scrolling profile ", len(post_links), "/",
                  num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)

            # Remove the part below to never stop scrolling before reaching
            # num_of_posts_to_do.
            if len(post_links) == previous_len:
                stalled_rounds += 1
                print(
                    "breaking in ", 4 - stalled_rounds,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                stalled_rounds = 0
            if stalled_rounds > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previous_len = len(post_links)
    except NoSuchElementException as err:
        # Continue with whatever links were collected before the failure.
        InstaLogger.logger().error('Something went terribly wrong')
        InstaLogger.logger().error(err)

    post_infos = []
    # All username links who commented on any post of this user.
    user_commented_total_list = []

    for counter, postlink in enumerate(post_links, start=1):
        print("\n", counter, "/", len(post_links))
        try:
            (caption, location_url, location_name, location_id, lat, lng,
             imgs, imgdesc, tags, likes, commentscount, date,
             user_commented_list, user_comments, mentions, user_liked_post,
             views) = extract_post_info(browser, postlink)

            location = {
                'location_url': location_url,
                'location_name': location_name,
                'location_id': location_id,
                'latitude': lat,
                'longitude': lng,
            }
            post_infos.append({
                'caption': caption,
                'location': location,
                'imgs': imgs,
                'imgdesc': imgdesc,
                'preview_img': preview_imgs.get(postlink, None),
                'date': date,
                'tags': tags,
                'likes': {
                    'count': likes,
                    'list': user_liked_post
                },
                'views': views,
                'url': postlink,
                'comments': {
                    'count': commentscount,
                    'list': user_comments
                },
                'mentions': mentions
            })
            user_commented_total_list.extend(user_commented_list)
        except Exception as err:
            # Best effort per post; `except Exception` (not a bare except)
            # lets KeyboardInterrupt/SystemExit still stop the crawl.
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
            InstaLogger.logger().error(err)

    return post_infos, user_commented_total_list