def extract_information(browser, username, should_extract_followers, limit_amount):
    """Get all the information for the given username."""
    InstaLogger.logger().info('Extracting information from ' + username)

    isprivate = False
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999
    try:
        userinfo = get_user_info(browser, username, should_extract_followers)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, userinfo['num_of_posts'])
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    post_infos = []
    user_commented_total_list = []

    if Settings.scrape_posts_infos is True and isprivate is False:
        try:
            post_infos, user_commented_total_list = extract_user_posts(
                browser, num_of_posts_to_do)
        except Exception:
            InstaLogger.logger().error("Couldn't get user posts.")

    userinfo['posts'] = post_infos
    userinfo['scraped'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    InstaLogger.logger().info("User " + username + " has " +
                              str(len(user_commented_total_list)) + " comments.")

    # Sort the list by frequency, so users who comment the most come first.
    import collections
    from operator import itemgetter
    counter = collections.Counter(user_commented_total_list)
    com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    com = map(lambda x: [x[0]] * x[1], com)
    user_commented_total_list = [item for sublist in com for item in sublist]

    # Remove duplicates while preserving order (which is why set() is not used),
    # skipping comments made by the profile owner.
    user_commented_list = []
    last = ''
    for i in range(len(user_commented_total_list)):
        if username.lower() != user_commented_total_list[i]:
            if last != user_commented_total_list[i]:
                user_commented_list.append(user_commented_total_list[i])
            last = user_commented_total_list[i]

    return userinfo, user_commented_list
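
# The commenter ranking in extract_information above can be hard to follow inline,
# so here is a minimal, self-contained sketch of the same idea: count how often each
# user commented, expand the counts back into a frequency-ordered list, then drop
# consecutive duplicates while preserving that order. The helper name
# rank_commenters_example is hypothetical and not part of the crawler; it only
# mirrors the logic used above.
def rank_commenters_example(commenters, profile_owner):
    import collections
    from operator import itemgetter

    counter = collections.Counter(commenters)
    # Most frequent commenters first; ties broken by name for a stable order.
    ranked = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    expanded = [name for name, count in ranked for _ in range(count)]

    # Keep only the first of each run of repeated names (order-preserving dedup)
    # and skip comments made by the profile owner.
    result, last = [], ''
    for name in expanded:
        if name != profile_owner.lower():
            if name != last:
                result.append(name)
            last = name
    return result

# Example: rank_commenters_example(['bob', 'alice', 'bob', 'owner'], 'owner')
# returns ['bob', 'alice'].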
def extract_information(browser, username):
    """Lightweight variant: fetch only the profile info (no posts or followers)."""
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    try:
        userinfo = get_user_info(browser, username)
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    return userinfo
def extract_followers(browser, username):
    InstaLogger.logger().info('Extracting followers from ' + username)
    try:
        user_link = "https://www.instagram.com/{}".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    sleep(5)
    followers = []

    # Open the followers dialog (second counter in the profile header).
    elem = browser.find_element_by_xpath(
        "//div[@id='react-root']//header[@class='vtbgv ']//ul[@class='k9GMp ']/child::li[2]/a/span")
    elem.click()
    sleep(15)

    # Scroll past the suggestion list so that 24 list elements load after this point.
    browser.execute_script(
        "document.getElementsByClassName('isgrP')[0].scrollTo(0,500)")
    sleep(10)

    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']")
    for i in range(12):
        followers.append(elems[i].get_attribute('innerHTML'))
    for i in range(12):
        browser.execute_script(
            "document.getElementsByClassName('PZuss')[0].children[0].remove()")

    isDone = False
    while True:
        try:
            start = time()
            browser.execute_script(
                "document.getElementsByClassName('isgrP')[0].scrollTo(0,"
                "document.getElementsByClassName('isgrP')[0].scrollHeight)")
            # Wait until exactly 24 entries are loaded; give up after 10 seconds.
            while True:
                try:
                    if int(browser.execute_script(
                            "return document.getElementsByClassName('PZuss')[0].children.length")) == 24:
                        break
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception:
                    continue
                if time() - start > 10:
                    isDone = True
                    break
            if isDone:
                break

            # Collect the first 12 loaded entries, then drop them from the DOM so the
            # dialog keeps loading new ones without growing unboundedly.
            elems = browser.find_elements_by_xpath(
                "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']")
            for i in range(12):
                followers.append(elems[i].get_attribute('innerHTML'))
            for i in range(12):
                browser.execute_script(
                    "document.getElementsByClassName('PZuss')[0].children[0].remove()")
            InstaLogger.logger().info(time() - start)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            continue

    # Collect whatever entries are still left in the dialog.
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']")
    for i in range(len(elems)):
        followers.append(elems[i].get_attribute('innerHTML'))

    return followers
def extract_user_posts_links(browser, username, limit_amount):
    """Get all post links (and preview image URLs) for the given username."""
    InstaLogger.logger().info('Extracting information from ' + username)

    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999
    user_info = {}
    try:
        user_info = get_user_info(browser, username)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, user_info['num_of_posts'])
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    # Get all posts from the user.
    indexed_links = dict()
    preview_images = {}
    try:
        body_elem = browser.find_element_by_tag_name('body')
        previouslen = 0
        breaking = 0
        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print("Getting first", num_of_posts_to_scroll, "posts but checking",
              num_of_posts_to_do, "posts only; if you want to change this limit, "
              "change the limit_amount value in crawl_profile.py\n")

        while len(indexed_links) < num_of_posts_to_do:
            prev_divs = browser.find_elements_by_tag_name('main')
            links_elems = [div.find_elements_by_tag_name('a') for div in prev_divs]
            links = sum([[link_elem.get_attribute('href') for link_elem in elems]
                         for elems in links_elems], [])

            # Remember a preview image for every post link that is currently visible.
            for elems in links_elems:
                for link_elem in elems:
                    href = link_elem.get_attribute('href')
                    try:
                        if "/p/" in href:
                            try:
                                img = link_elem.find_element_by_tag_name('img')
                                src = img.get_attribute('src')
                                preview_images[href] = src
                            except NoSuchElementException:
                                print("img exception 132")
                                continue
                    except Exception as err:
                        print(err)

            # Index each new post link in the order it was discovered.
            for link in links:
                if "/p/" in link:
                    if len(indexed_links) < num_of_posts_to_do:
                        if link not in indexed_links:
                            indexed_links[link] = len(indexed_links)

            print("Scrolling profile ", len(indexed_links), "/", num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep_time = Settings.sleep_time_between_post_scroll
            sleep(random.uniform(sleep_time - 1, sleep_time + 1))

            # Remove the block below to never stop scrolling before reaching num_of_posts_to_do.
            if len(indexed_links) == previouslen:
                breaking += 1
                print("breaking in ", 4 - breaking,
                      "...\nIf you believe this is only caused by slow internet, "
                      "increase sleep time 'sleep_time_between_post_scroll' in settings.py")
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(indexed_links)

    except NoSuchElementException as err:
        InstaLogger.logger().error('Something went terribly wrong')

    return user_info, indexed_links, preview_images
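
# A minimal usage sketch, assuming a Selenium 3 Chrome driver (the extractors above
# rely on the find_element_by_* API) and that the helper names they use
# (InstaLogger, Settings, web_adress_navigator, get_user_info) resolve in this
# module. "some_public_profile" and the limit of 100 posts are placeholder values,
# not defaults required by the crawler.
if __name__ == "__main__":
    from selenium import webdriver

    browser = webdriver.Chrome()
    try:
        user_info, post_links, previews = extract_user_posts_links(
            browser, "some_public_profile", limit_amount=100)
        print(len(post_links), "post links,", len(previews), "preview images")

        followers = extract_followers(browser, "some_public_profile")
        print(len(followers), "followers collected")
    finally:
        browser.quit()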