def web_adress_navigator(browser, link):
    """Navigate *browser* to *link* only when it is not already there.

    Compares the browser's current URL with the target and performs the
    navigation on mismatch, then waits for the page's viewport element.

    Raises:
        PageNotFound404: when the loaded page reports a "not found" title.
    """
    try:
        current_url = browser.current_url
    except WebDriverException:
        # Some driver states make .current_url fail; ask the page directly.
        try:
            current_url = browser.execute_script("return window.location.href")
        except WebDriverException:
            current_url = None

    if current_url is None or current_url != link:
        browser.get(link)
        if check_page_title_notfound(browser):
            InstaLogger.logger().error("Failed to get page " + link)
            raise PageNotFound404("Failed to get page " + link)
        # Wait until the layout container exists so callers can scrape safely.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, "viewport")))
def extract_users_from_comments(self, comments):
    """Collect commenter usernames and (optionally) their comment texts.

    Returns:
        tuple: (user_commented_list, user_comments) where the first holds
        every commenter's username and the second holds dicts with 'user'
        and 'comment' keys -- all comments when Settings.output_comments
        is on, otherwise only the first one (which carries the caption).
    """
    user_commented_list = []
    user_comments = []
    for comm in comments:
        try:
            user_commented = comm.find_element_by_tag_name(
                'a').get_attribute("href").split('/')
            user_commented_list.append(user_commented[3])
        except Exception:
            InstaLogger.logger().error("ERROR something went wrong getting user_commented")
            # BUGFIX: without a username for THIS comment, skip it; the
            # original fell through and reused the previous iteration's name
            continue
        # first comment has to be loaded every time to get the caption and tag from post
        if Settings.output_comments is True or len(user_comments) < 1:
            try:
                comment_text = comm.find_element_by_css_selector(
                    'h2 + span, h3 + span').text
                user_comment = {'user': user_commented[3], 'comment': comment_text}
                InstaLogger.logger().info(user_comment)
                InstaLogger.logger().info(user_commented[3] + " -- " + comment_text)
                user_comments.append(user_comment)
            except Exception:
                InstaLogger.logger().error("ERROR something went wrong getting comment")
    InstaLogger.logger().info(str(len(user_commented_list)) + " comments.")
    return user_commented_list, user_comments
def extract_comments(self):
    """Return (comment_elements, comment_count) for the current post.

    The first <li> of the comment list is the caption, hence the count is
    len - 1 (and -1 when no <ul> exists at all).
    """
    comments = []
    try:
        if self.post.find_elements_by_tag_name('ul'):
            comment_list = self.post.find_element_by_tag_name('ul')
            comments = comment_list.find_elements_by_tag_name('li')
    except Exception as e:
        # BUGFIX: the original's bare `except:` after `except BaseException`
        # was unreachable dead code; a single handler covers both messages
        InstaLogger.logger().error("Error - getting comments")
        InstaLogger.logger().error(e)
    return comments, int(len(comments) - 1)
def extract_post_likers(browser, post, postlink, likes):
    """Open the post's likers dialog and scroll until *likes* usernames
    are collected or scrolling stalls more than 10 times.

    Returns:
        list: liker usernames; empty when Settings.scrape_posts_likers is off.
    """
    user_liked_list = []
    if Settings.scrape_posts_likers is False:
        return user_liked_list

    InstaLogger.logger().info("GETTING LIKERS FROM POST")
    likers_xpath = "//li[@class='wo9IH']//a[contains(@class, 'FPmhX')]"
    try:
        # open the "liked by" dialog
        post.find_element_by_xpath("//a[@class='zV_Nj']").click()
        likers_list = post.find_elements_by_xpath(likers_xpath)
        tried_catch_likers = 0
        while len(likers_list) < likes:
            likers_list_before = len(likers_list)
            InstaLogger.logger().info(
                "found likers: " + str(len(likers_list)) + " should be "
                + str(likes) + " -- scroll for more")
            try:
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'wwxN2')]")
                browser.execute_script(
                    "arguments[0].scrollTop = arguments[0].scrollHeight",
                    div_likebox_elem)
            except Exception as e:
                tried_catch_likers += 1
                # BUGFIX: the original concatenated the exception object to a
                # str ("..." + e) which raised TypeError inside the handler
                print("error on scrolling - next try (tried: "
                      + str(tried_catch_likers) + ") Message:" + str(e))
            sleep(Settings.sleep_time_between_post_scroll)
            likers_list = post.find_elements_by_xpath(likers_xpath)
            if likers_list_before == len(likers_list):
                # no progress after the scroll: count it and back off longer
                tried_catch_likers += 1
                print("error on scrolling - next try (tried: "
                      + str(tried_catch_likers) + ")")
                sleep(Settings.sleep_time_between_post_scroll * 1.5)
            if tried_catch_likers > 10:
                print("exit scrolling likers")
                break
        likers_list = post.find_elements_by_xpath(likers_xpath)
        for liker in likers_list:
            user_like = liker.get_attribute("href").split('/')
            user_liked_list.append(user_like[3])
        InstaLogger.logger().info('likers: ' + str(len(user_liked_list)))
    except Exception as e:
        InstaLogger.logger().error("Error - getting post likers")
        InstaLogger.logger().error(e)
    return user_liked_list
def extract_caption(self, user_comments, username):
    """Return the post caption.

    The caption is the text of the first comment when that comment was
    written by the post author *username*; otherwise an empty string.
    """
    caption = ''
    if user_comments:
        first_entry = user_comments[0]
        if first_entry['user'] == username:
            caption = first_entry['comment']
    InstaLogger.logger().info(f"caption: {caption}")
    return caption
def extract_username(self):
    """Return the post author's username, or '' when it cannot be read."""
    username = ''
    try:
        username = self.post.find_element_by_class_name(
            'e1e1d').find_element_by_tag_name('a').text
    except Exception:
        # BUGFIX: narrowed from a bare `except:` (was hiding SystemExit etc.)
        InstaLogger.logger().error("ERROR - getting Post infos (username) ")
    return username
def extract_date(self):
    """Return the post's datetime attribute string, or '' on failure."""
    date = ''
    try:
        date = self.post.find_element_by_xpath(
            '//a/time').get_attribute("datetime")
        InstaLogger.logger().info("Post date: " + str(date))
    except Exception:
        # BUGFIX: narrowed from a bare `except:`
        InstaLogger.logger().error("ERROR - getting Post Date ")
    return date
def extract_information(browser, username, limit_amount):
    """Get all the information for the given username.

    Navigates to the profile, scrapes the profile info and (when enabled
    and the profile is public) its posts.

    Returns:
        tuple: (ig_user, user_commented_list); commenters are sorted by
        comment frequency with consecutive duplicates and the profile
        owner removed.

    Raises:
        NoInstaProfilePageFound: when the profile page cannot be loaded.
    """
    InstaLogger.logger().info('Extracting information from ' + username)
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    ig_user = InstagramUser(browser, username)
    ig_user.get_user_info()
    if limit_amount < 1:
        limit_amount = 999999
    num_of_posts_to_do = min(limit_amount, ig_user.num_of_posts['count'])

    post_infos = []
    user_commented_total_list = []
    if Settings.scrape_posts_infos is True and ig_user.isprivate is False:
        post_infos, user_commented_total_list = quick_post_extract(
            browser, num_of_posts_to_do)

    ig_user.posts = post_infos
    ig_user.scraped = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    InstaLogger.logger().info("User " + username + " has " +
                              str(len(user_commented_total_list)) + " comments.")

    # sorts the list by frequencies, so users who comment the most are at the top
    import collections
    from operator import itemgetter
    counter = collections.Counter(user_commented_total_list)
    com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    com = map(lambda x: [x[0]] * x[1], com)
    user_commented_total_list = [item for sublist in com for item in sublist]

    # remove duplicates preserving order (that's why not using set())
    user_commented_list = []
    last = ''
    for commenter in user_commented_total_list:
        if username.lower() != commenter and last != commenter:
            user_commented_list.append(commenter)
            last = commenter
    return ig_user, user_commented_list
def extract_post_mentions(self):
    """Return usernames mentioned in the post (empty when Settings.mentions is off)."""
    mentions = []
    if Settings.mentions is False:
        return mentions
    mention_elems = self.post.find_elements_by_class_name('JYWcJ')  # perhaps JYWcJ
    for elem in mention_elems:
        parts = elem.get_attribute("href").split('/')
        mentions.append(parts[3])
    InstaLogger.logger().info(f"mentions: {str(len(mentions))}")
    return mentions
def extract_post_caption(user_comments, username):
    """Return (caption, tags).

    The caption is the first comment's text when it was posted by
    *username*; tags are the '#hashtag' tokens found in that caption.
    """
    tags = []
    caption = ''
    try:
        if len(user_comments) > 0:
            user_commented = user_comments[0]
            if username == user_commented['user']:
                caption = user_commented['comment']
        InstaLogger.logger().info("caption:" + caption)
        tags = findall(r'#[A-Za-z0-9]*', caption)
    except Exception:
        # BUGFIX: narrowed from a bare `except:`
        InstaLogger.logger().error("getting caption")
    return caption, tags
def extract_post_mentions(browser, post):
    """Return usernames mentioned in *post*; empty when mention scraping
    is disabled via Settings.mentions."""
    mentions = []
    if Settings.mentions is True:
        try:
            mention_list = post.find_elements_by_class_name('xUdfV')  # perhaps JYWcJ
            for mention in mention_list:
                user_mention = mention.get_attribute("href").split('/')
                mentions.append(user_mention[3])
            InstaLogger.logger().info(str(len(mentions)) + "mentions")
        except Exception:
            # BUGFIX: narrowed from a bare `except:`
            InstaLogger.logger().error("getting mentions")
    return mentions
def extract_comments(self):
    """Load every comment of the post and dump them to a TSV file.

    Repeatedly clicks the "load more comments" button, then writes
    (author, post, commenter, comment) rows to '<username>/<post_code>.txt'
    (assumes the '<username>' directory already exists -- TODO confirm).

    Returns:
        tuple: (comments, len(comments) - 1); note the element list is
        never filled here, so the returned count is always -1.
    """
    print("extracting comments")
    comments = []
    try:
        # keep clicking the button until it disappears; the final
        # find_element raises and drops us out of the loop
        while self.browser.find_element_by_class_name('dCJp8'):
            more_button = self.browser.find_element_by_class_name('dCJp8')
            InstaLogger.logger().info("clicking button for loading more comments")
            self.browser.execute_script("arguments[0].click();", more_button)
            sleep(Settings.sleep_time_between_comment_loading)
    except BaseException as e:
        # BUGFIX: the original's extra bare `except:` after this handler
        # was unreachable dead code and has been removed
        InstaLogger.logger().error(e)
    print("Loaded all comments.")
    try:
        if self.browser.find_element_by_class_name('C4VMK'):
            cmnts = self.browser.find_elements_by_class_name('C4VMK')
            cmnt_text = []
            for cmnt in cmnts:
                c = cmnt.find_elements_by_tag_name('span')[-1].get_attribute('innerHTML')
                u = cmnt.find_element_by_class_name(
                    '_6lAjh').find_element_by_tag_name('a').get_attribute('innerHTML')
                cmnt_text.append((u, c))
            # save comments in a file
            post_name = self.postlink.split("/")[-2]
            print("saving to file: " + self.username + "/" + post_name + ".txt")
            with open(self.username + "/" + post_name + ".txt", 'w+') as f:
                f.write("author\tpostlink\tcommenter\tcomment\n")
                for t in cmnt_text:
                    f.write(self.username + "\t")
                    f.write(post_name + "\t")
                    f.write('\t'.join(str(s) for s in t) + '\n')
    except Exception:
        InstaLogger.logger().error("Error - getting comments")
    return comments, int(len(comments) - 1)
def init_chromedriver(chrome_options, capabilities):
    """Start a Chrome webdriver and verify the chromedriver version.

    Raises:
        WebDriverException: when chromedriver is missing at
            Settings.chromedriver_location.
        Exception: when the driver version is below
            Settings.chromedriver_min_version.
    """
    chromedriver_location = Settings.chromedriver_location
    try:
        browser = webdriver.Chrome(chromedriver_location,
                                   desired_capabilities=capabilities,
                                   chrome_options=chrome_options)
    except WebDriverException as exc:
        InstaLogger.logger().error('ensure chromedriver is installed at {}'.format(
            Settings.chromedriver_location))
        raise exc
    # BUGFIX: guard against an unparsable version string (re.match -> None
    # previously raised AttributeError); also compute the float only once
    matches = re.match(r'^(\d+\.\d+)',
                       browser.capabilities['chrome']['chromedriverVersion'])
    driver_version = float(matches.groups()[0]) if matches else 0.0
    if driver_version < Settings.chromedriver_min_version:
        InstaLogger.logger().error('chromedriver {} is not supported, expects {}+'.format(
            driver_version, Settings.chromedriver_min_version))
        browser.close()
        raise Exception('wrong chromedriver version')
    return browser
def extract_user_posts(browser, num_of_posts_to_do):
    """Scrape up to *num_of_posts_to_do* posts of the current profile.

    Builds an InstagramPost per collected link.

    Returns:
        tuple: (post_infos, user_commented_total_list) -- the second item
        holds every username that commented on any scraped post.
    """
    links2, preview_imgs = get_num_posts(browser, num_of_posts_to_do)
    post_infos = []
    user_commented_total_list = []
    for counter, postlink in enumerate(links2, start=1):
        InstaLogger.logger().info(f"\n {counter} / {len(links2)}")
        try:
            instagram_post = InstagramPost(browser, postlink)
            instagram_post.extract_post_info()
            location = {
                'location_url': instagram_post.location_url,
                'location_name': instagram_post.location_name,
                'location_id': instagram_post.location_id,
                'latitude': instagram_post.lat,
                'longitude': instagram_post.lng,
            }
            post_infos.append({
                'caption': instagram_post.caption,
                'location': location,
                'imgs': instagram_post.imgs,
                'imgdesc': instagram_post.imgdesc,
                'preview_img': preview_imgs.get(instagram_post.postlink, None),
                'date': instagram_post.date,
                'tags': instagram_post.tags,
                'likes': {
                    'count': instagram_post.likes,
                    'list': instagram_post.user_liked_list
                },
                'views': instagram_post.views,
                'url': instagram_post.postlink,
                'comments': {
                    'count': instagram_post.commentscount,
                    'list': instagram_post.user_comments
                },
                'mentions': instagram_post.mentions
            })
            user_commented_total_list += instagram_post.user_commented_list
        except NoSuchElementException as err:
            # BUGFIX: log `postlink`, not `instagram_post.postlink` --
            # `instagram_post` is unbound when the constructor itself raised
            InstaLogger.logger().error("Could not get information from post: " + postlink)
            InstaLogger.logger().error(err)
    return post_infos, user_commented_total_list
def __init__(self, browser, postlink):
    """Open *postlink* in *browser* and grab the post's root element."""
    self.browser = browser
    self.postlink = postlink
    try:
        InstaLogger.logger().info("Scraping Post Link: " + self.postlink)
        web_adress_navigator(self.browser, self.postlink)
    except PageNotFound404 as e:
        raise NoInstaPostPageFound(e)
    except NoSuchElementException as err:
        InstaLogger.logger().error(
            "Could not get information from post: " + self.postlink)
        InstaLogger.logger().error(err)
    # root container every extract_* method scrapes from
    self.post = self.browser.find_element_by_class_name('ltEKP')
def extract_image_data(self):
    """Return (img_tags, imgs, imgdesc): the post's <img> elements plus
    their src URLs and alt texts, in page order."""
    img_tags = self.post.find_elements_by_class_name('FFVAD')
    InstaLogger.logger().info("number of images: " + str(len(img_tags)))
    imgs = []
    imgdesc = []
    for tag in img_tags:
        imgs.append(tag.get_attribute('src'))
        imgdesc.append(tag.get_attribute('alt'))
        InstaLogger.logger().info(f"post image: {imgs[-1]}")
        InstaLogger.logger().info(f"alt text: {imgdesc[-1]}")
    return img_tags, imgs, imgdesc
def extract_post_mentions(browser, post):
    """Return usernames mentioned in *post*; empty when Settings.mentions is off."""
    mentions = []
    if Settings.mentions is False:
        return mentions
    try:
        mention_elems = post.find_elements_by_class_name('JYWcJ')  # perhaps JYWcJ
        for elem in mention_elems:
            parts = elem.get_attribute("href").split('/')
            mentions.append(parts[3])
        InstaLogger.logger().info("mentions: " + str(len(mentions)))
    except Exception as err:
        InstaLogger.logger().error("Error - getting mentions")
        InstaLogger.logger().error(err)
    return mentions
def _extract_post_info(proxy_browser, post_link):
    """Scrape a single post and return its info dict, or None on error."""
    try:
        caption, location_url, location_name, location_id, lat, lng, imgs, \
            img_desc, tags, likes, comments_count, date, user_commented_list, \
            user_comments, mentions, user_liked_post, views = extract_post_info(
                proxy_browser, post_link)
        location = {
            'location_url': location_url,
            'location_name': location_name,
            'location_id': location_id,
            'latitude': lat,
            'longitude': lng,
        }
        return {
            'caption': caption,
            'location': location,
            'imgs': imgs,
            'imgdesc': img_desc,
            'date': date,
            'tags': tags,
            'likes': {
                'count': likes,
                'list': user_liked_post
            },
            'views': views,
            'url': post_link,
            'comments': {
                'count': comments_count,
                'list': user_comments
            },
            'mentions': mentions
        }
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " + post_link)
        InstaLogger.logger().error(err)
    except Exception as ex:
        InstaLogger.logger().error("Could not get information from post: " + post_link)
        # BUGFIX: record the unexpected exception instead of dropping it
        InstaLogger.logger().error(ex)
    return None
def get_user_info(self):
    """Get the basic user info from the profile screen.

    Populates isprivate, alias, bio, bio_url, profile_image and the
    post/follower/following counters on self; optionally also the
    follower list when Settings.scrape_follower is enabled and the
    profile is public.
    """
    self.isprivate = self._is_user_private()
    self.alias = self._user_alias()
    self.bio = self._user_bio()
    self.bio_url = self._user_bio_url()
    self.profile_image = self._user_profile_image()
    infos = self.container.find_elements_by_class_name('Y8-fY')
    if infos:
        # counter order on the page: posts, followers, following
        self.num_of_posts = {'count': extract_exact_info(infos[0])}
        self.following = {'count': extract_exact_info(infos[2])}
        self.followers = {'count': extract_exact_info(infos[1])}
        # BUGFIX: `is True` instead of the non-idiomatic `== True`
        if Settings.scrape_follower is True and not self.isprivate:
            self.followers['list'] = extract_followers(
                self.browser, self.username)
    InstaLogger.logger().info("Alias name: " + self.alias)
    InstaLogger.logger().info("Bio: " + self.bio)
    InstaLogger.logger().info("Url: " + self.bio_url)
    InstaLogger.logger().info("Posts: " + str(self.num_of_posts))
    InstaLogger.logger().info("Follower: " + str(self.followers['count']))
    InstaLogger.logger().info("Following: " + str(self.following))
    InstaLogger.logger().info("Is private: " + str(self.isprivate))
def quick_post_extract(browser, num_of_posts_to_do):
    """Extract post metadata directly from Instagram's React state.

    Scrolls the feed and reads the `combinedPosts` array out of the React
    internal instance, avoiding a page visit per post.

    Returns:
        tuple: (post_infos, []) -- commenter extraction is not done here.
    """
    body_elem = browser.find_element_by_tag_name('body')
    previouslen = 0
    breaking = 0
    post_infos = []
    posts_set = set()
    posts_set_len = 0
    # hoisted: the script is loop-invariant (the original rebuilt it each pass)
    JSGetPostsFromReact = """
        var feed = document.getElementsByTagName('article')[0];
        var __reactInternalInstanceKey = Object.keys(feed).filter(k=>k.startsWith('__reactInternalInstance'))[0]
        var posts = feed[__reactInternalInstanceKey].return.stateNode.state.combinedPosts
        return posts;
    """
    while posts_set_len < num_of_posts_to_do:
        posts_json = browser.execute_script(JSGetPostsFromReact)
        for post_json in posts_json:
            # TODO: Convert to InstagramPost
            # instagram_post = InstagramPost.from_react_json(post_json)
            post_code = post_json['code']
            if post_code in posts_set:
                continue
            posts_set.add(post_code)
            location = {}
            if post_json.get('location'):
                loc_id = post_json['location']['id']
                loc_slug = post_json['location']['slug']
                location = {
                    'location_url': f"https://www.instagram.com/explore/locations/{loc_id}/{loc_slug}/",
                    'location_name': post_json['location']['name'],
                    'location_id': loc_id,
                    'latitude': post_json['location']['lat'],
                    'longitude': post_json['location']['lng'],
                }
            num_comments = post_json['numComments']
            num_likes = post_json.get('numLikes') or post_json.get(
                'numPreviewLikes', -1)
            post_infos.append({
                'caption': post_json.get('caption'),
                'location': location,
                'imgs': [],
                'imgdesc': [],
                'preview_img': post_json['thumbnailResources'],
                'date': post_json['postedAt'],
                'tags': [],
                'likes': {
                    'count': num_likes,
                    'list': []
                },
                'views': post_json.get('videoViews', -1),
                'url': f"https://www.instagram.com/p/{post_code}/",
                'comments': {
                    'count': num_comments,
                    'list': []
                },
                'mentions': []
            })
        body_elem.send_keys(Keys.END)
        sleep(Settings.sleep_time_between_post_scroll)
        posts_set_len = len(posts_set)
        # remove below part to never break the scrolling script before
        # reaching the num_of_posts
        if posts_set_len == previouslen:
            breaking += 1
            InstaLogger.logger().info(
                f"breaking in {4 - breaking}...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
            )
        else:
            breaking = 0
        if breaking > 3:
            InstaLogger.logger().info(
                "Not getting any more posts, ending scrolling")
            sleep(2)
            break
        previouslen = len(post_infos)
    return post_infos, []
def get_num_posts(browser, num_of_posts_to_do):
    """Get all posts from user.

    Scrolls the profile grid until *num_of_posts_to_do* unique '/p/'
    links are collected or scrolling stalls four times in a row.

    Returns:
        tuple: (links2, preview_imgs) -- links2 is the list of unique post
        links; preview_imgs maps a post link to its thumbnail img src.
    """
    links2 = []
    preview_imgs = {}
    try:
        body_elem = browser.find_element_by_tag_name('body')
        previouslen = 0
        breaking = 0
        InstaLogger.logger().info(
            f"number of posts to do: {num_of_posts_to_do}")
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        InstaLogger.logger().info(
            f"Getting first {num_of_posts_to_scroll} posts but checking {num_of_posts_to_do} posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while len(links2) < num_of_posts_to_do:
            main_divs = browser.find_elements_by_tag_name('main')
            links_elems = [
                div.find_elements_by_tag_name('a') for div in main_divs
            ]
            links = sum(
                [[link_elem.get_attribute('href') for link_elem in elems]
                 for elems in links_elems], [])
            for elems in links_elems:
                for link_elem in elems:
                    href = link_elem.get_attribute('href')
                    if "/p/" in href:
                        # BUGFIX: guard the img lookup (as the sibling
                        # extract_user_posts does) so one missing thumbnail
                        # does not abort the whole scroll
                        try:
                            img = link_elem.find_element_by_tag_name('img')
                            preview_imgs[href] = img.get_attribute('src')
                        except NoSuchElementException:
                            continue
            for link in links:
                if "/p/" in link and len(links2) < num_of_posts_to_do:
                    links2.append(link)
            links2 = list(set(links2))
            InstaLogger.logger().info(
                f"Scrolling profile {len(links2)} / {num_of_posts_to_scroll}")
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)
            # remove below part to never break the scrolling script before
            # reaching the num_of_posts
            if len(links2) == previouslen:
                breaking += 1
                InstaLogger.logger().info(
                    f"breaking in {4 - breaking}...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                InstaLogger.logger().info(
                    "Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(links2)
    except NoSuchElementException:
        InstaLogger.logger().error('Something went terribly wrong')
    return links2, preview_imgs
def extract_followers(browser, username):
    """Open *username*'s follower dialog and harvest follower usernames.

    Works in batches: reads 12 entries from the dialog, deletes them from
    the DOM (so the list stays short and keeps loading), and repeats until
    no fresh batch of 24 children appears within 10 seconds.

    Returns a list of follower usernames (duplicates are possible).
    Raises NoInstaProfilePageFound when the profile page cannot be loaded.
    """
    InstaLogger.logger().info('Extracting follower from ' + username)
    try:
        user_link = "https://www.instagram.com/{}".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)
    sleep(5)
    followers = []
    # find number of followers: the second <li> in the profile header
    # holds the follower counter; clicking it opens the dialog
    elem = browser.find_element_by_xpath(
        "//div[@id='react-root']//header[@class='vtbgv ']//ul[@class='k9GMp ']/child::li[2]/a/span"
    )
    elem.click()
    sleep(15)
    # remove suggestion list and load 24 list elements after this
    browser.execute_script(
        "document.getElementsByClassName('isgrP')[0].scrollTo(0,500)")
    sleep(10)
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']")
    # consume the first visible batch of 12 names ...
    for i in range(12):
        val = elems[i].get_attribute('innerHTML')
        followers.append(val)
    # ... then drop them from the DOM so the dialog loads the next batch
    for i in range(12):
        browser.execute_script(
            "document.getElementsByClassName('PZuss')[0].children[0].remove()")
    isDone = False
    while 1:
        try:
            start = time()
            # scroll the dialog to its bottom to trigger loading
            browser.execute_script(
                "document.getElementsByClassName('isgrP')[0].scrollTo(0,document.getElementsByClassName('isgrP')[0].scrollHeight)"
            )
            # busy-wait until a full batch of 24 children is present,
            # or give up after ~10s (no new followers arriving)
            while 1:
                try:
                    if int(
                            browser.execute_script(
                                "return document.getElementsByClassName('PZuss')[0].children.length"
                            )) == 24:
                        break
                except (KeyboardInterrupt, SystemExit):
                    # f.close()
                    raise
                except:
                    # transient JS/DOM errors while the list re-renders;
                    # NOTE(review): bare except kept as-is -- intentional
                    # best-effort retry loop
                    continue
                if time() - start > 10:
                    isDone = True
                    break
            if isDone:
                break
            elems = browser.find_elements_by_xpath(
                "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']"
            )
            # read the first 12 of the fresh batch, then delete them again
            list_segment = ""
            for i in range(12):
                val = elems[i].get_attribute('innerHTML')
                list_segment += (val + '\n')
                followers.append(val)
            for i in range(12):
                browser.execute_script(
                    "document.getElementsByClassName('PZuss')[0].children[0].remove()"
                )
            InstaLogger.logger().info(time() - start)
        except (KeyboardInterrupt, SystemExit):
            # f.close()
            raise
        except:
            # retry the whole scroll/read cycle on any other failure
            continue
    # finally collect whatever is still left in the dialog
    list_segment = ""
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate _0imsa ']")
    for i in range(len(elems)):
        val = elems[i].get_attribute('innerHTML')
        list_segment += (val + '\n')
        followers.append(val)
    return followers
def extract_user_posts(browser, num_of_posts_to_do):
    """Get all posts from user.

    Scrolls the profile to collect up to *num_of_posts_to_do* post links
    (plus preview image srcs), then scrapes every post via
    extract_post_info.

    Returns:
        tuple: (post_infos, user_commented_total_list).
    """
    links = []
    links2 = []        # all unique post links collected so far
    preview_imgs = {}  # post link -> preview image src
    try:
        body_elem = browser.find_element_by_tag_name('body')
        previouslen = 0
        breaking = 0
        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll, "posts but checking ",
            num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while len(links2) < num_of_posts_to_do:
            main_divs = browser.find_elements_by_tag_name('main')
            links_elems = [
                div.find_elements_by_tag_name('a') for div in main_divs
            ]
            links = sum(
                [[link_elem.get_attribute('href') for link_elem in elems]
                 for elems in links_elems], [])
            for elems in links_elems:
                for link_elem in elems:
                    href = link_elem.get_attribute('href')
                    try:
                        if "/p/" in href:
                            try:
                                img = link_elem.find_element_by_tag_name('img')
                                preview_imgs[href] = img.get_attribute('src')
                            except NoSuchElementException:
                                print("img exception 132")
                                continue
                    except Exception as err:
                        print(err)
            for link in links:
                if "/p/" in link and len(links2) < num_of_posts_to_do:
                    links2.append(link)
            links2 = list(set(links2))
            print("Scrolling profile ", len(links2), "/", num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)
            # remove below part to never break the scrolling script before
            # reaching the num_of_posts
            if len(links2) == previouslen:
                breaking += 1
                print(
                    "breaking in ", 4 - breaking,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(links2)
    except NoSuchElementException:
        InstaLogger.logger().error('Something went terribly wrong')

    post_infos = []
    # all usernames that commented on any post of this user
    user_commented_total_list = []
    counter = 1
    for postlink in links2:
        print("\n", counter, "/", len(links2))
        counter += 1
        try:
            caption, location_url, location_name, location_id, lat, lng, imgs, \
                imgdesc, tags, likes, commentscount, date, user_commented_list, \
                user_comments, mentions, user_liked_post, views = extract_post_info(
                    browser, postlink)
            location = {
                'location_url': location_url,
                'location_name': location_name,
                'location_id': location_id,
                'latitude': lat,
                'longitude': lng,
            }
            post_infos.append({
                'caption': caption,
                'location': location,
                'imgs': imgs,
                'imgdesc': imgdesc,
                'preview_img': preview_imgs.get(postlink, None),
                'date': date,
                'tags': tags,
                'likes': {
                    'count': likes,
                    'list': user_liked_post
                },
                'views': views,
                'url': postlink,
                'comments': {
                    'count': commentscount,
                    'list': user_comments
                },
                'mentions': mentions
            })
            user_commented_total_list = user_commented_total_list + user_commented_list
        except NoSuchElementException as err:
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
            InstaLogger.logger().error(err)
        except Exception as ex:
            # BUGFIX: narrowed the bare `except:` and log the exception detail
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
            InstaLogger.logger().error(ex)
    return post_infos, user_commented_total_list
def get_user_info(browser, username):
    """Get the basic user info from the profile screen.

    Returns:
        dict: alias, username, bio, prof_img, num_of_posts, followers,
        following, bio_url and isprivate; each field falls back to an
        empty default when its page element is missing.
    """
    num_of_posts = 0
    followers = {'count': 0}
    following = {'count': 0}
    prof_img = ""
    bio = ""
    bio_url = ""
    alias = ""
    container = browser.find_element_by_class_name('v9tJq')
    isprivate = False
    try:
        # the 'Nd_Rl' banner only exists on private profiles
        if container.find_element_by_class_name('Nd_Rl'):
            isprivate = True
    except Exception:
        isprivate = False
    try:
        alias = container.find_element_by_class_name(
            '-vDIg').find_element_by_tag_name('h1').text
    except Exception:
        InstaLogger.logger().info("alias is empty")
    try:
        bio = container.find_element_by_class_name('-vDIg') \
            .find_element_by_tag_name('span').text
    except Exception:
        InstaLogger.logger().info("Bio is empty")
    try:
        bio_url = container.find_element_by_class_name('yLUwa').text
    except Exception:
        InstaLogger.logger().info("Bio Url is empty")
    try:
        img_container = browser.find_element_by_class_name('RR-M-')
        prof_img = img_container.find_element_by_tag_name('img').get_attribute('src')
    except Exception:
        InstaLogger.logger().info("image is empty")
    try:
        # counter order on the page: posts, followers, following
        infos = container.find_elements_by_class_name('Y8-fY')
        try:
            num_of_posts = extract_exact_info(infos[0])
        except Exception:
            InstaLogger.logger().error("Number of Posts empty")
        try:
            following = {'count': extract_exact_info(infos[2])}
        except Exception:
            InstaLogger.logger().error("Following is empty")
        try:
            followers = {'count': extract_exact_info(infos[1])}
            try:
                # BUGFIX: `is True` / truthiness instead of `== True`
                if Settings.scrape_follower is True:
                    if isprivate:
                        InstaLogger.logger().info(
                            "Cannot get Follower List - private account")
                    else:
                        followers['list'] = extract_followers(browser, username)
            except Exception as exception:
                # Output unexpected Exceptions.
                print("Unexpected error:", sys.exc_info()[0])
                print(exception)
                InstaLogger.logger().error("Cannot get Follower List")
        except Exception:
            InstaLogger.logger().error("Follower is empty")
    except Exception:
        InstaLogger.logger().error("Infos (Following, Abo, Posts) is empty")
    information = {
        'alias': alias,
        'username': username,
        'bio': bio,
        'prof_img': prof_img,
        'num_of_posts': num_of_posts,
        'followers': followers,
        'following': following,
        'bio_url': bio_url,
        'isprivate': isprivate,
    }
    InstaLogger.logger().info("alias name: " + information['alias'])
    InstaLogger.logger().info("bio: " + information['bio'])
    InstaLogger.logger().info("url: " + information['bio_url'])
    InstaLogger.logger().info("Posts: " + str(information['num_of_posts']))
    InstaLogger.logger().info("Follower: " + str(information['followers']['count']))
    InstaLogger.logger().info("Following: " + str(information['following']))
    InstaLogger.logger().info("isPrivate: " + str(information['isprivate']))
    return information
def extract_information(browser, username, limit_amount):
    """Get all the information for the given username (dict variant).

    Returns:
        tuple: (information, user_commented_list); commenters are sorted
        by frequency with duplicates and the profile owner removed.

    Raises:
        NoInstaProfilePageFound: when the profile page cannot be loaded.

    Note: terminates the interpreter via quit() when the profile info
    cannot be read at all (original behavior preserved).
    """
    InstaLogger.logger().info('Extracting information from ' + username)
    isprivate = False
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999
    alias_name = ''
    bio = ''
    prof_img = ''
    num_of_posts = 0
    followers = 0
    following = 0
    bio_url = ''
    try:
        alias_name, bio, prof_img, num_of_posts, followers, following, \
            bio_url, isprivate = get_user_info(browser)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, num_of_posts)
    except Exception:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()

    post_infos = []
    user_commented_total_list = []
    if Settings.scrap_posts_infos is True and isprivate is False:
        try:
            post_infos, user_commented_total_list = extract_user_posts(
                browser, num_of_posts_to_do)
        except Exception:
            # BUGFIX: narrowed from a bare `except:`
            InstaLogger.logger().error("Couldn't get user posts.")

    information = {
        'alias': alias_name,
        'username': username,
        'bio': bio,
        'prof_img': prof_img,
        'num_of_posts': num_of_posts,
        'followers': followers,
        'following': following,
        'bio_url': bio_url,
        'isprivate': isprivate,
        'scrapped': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'posts': post_infos,
    }
    InstaLogger.logger().info("User " + username + " has " +
                              str(len(user_commented_total_list)) + " comments.")

    # sorts the list by frequencies, so users who comment the most are at the top
    import collections
    from operator import itemgetter
    counter = collections.Counter(user_commented_total_list)
    com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    com = map(lambda x: [x[0]] * x[1], com)
    user_commented_total_list = [item for sublist in com for item in sublist]

    # remove duplicates preserving order (that's why not using set())
    user_commented_list = []
    last = ''
    for commenter in user_commented_total_list:
        if username.lower() != commenter and last != commenter:
            user_commented_list.append(commenter)
            last = commenter
    return information, user_commented_list
def get_user_info(browser):
    """Get the basic user info from the profile screen (tuple variant).

    Returns:
        tuple: (alias_name, bio, prof_img, num_of_posts, followers,
        following, bio_url, isprivate); each field falls back to an empty
        default when its page element is missing.
    """
    num_of_posts = 0
    followers = 0
    following = 0
    prof_img = ""
    bio = ""
    bio_url = ""
    alias_name = ""
    container = browser.find_element_by_class_name('v9tJq')
    isprivate = False
    try:
        # counter order on the page: posts, followers, following
        infos = container.find_elements_by_class_name('Y8-fY')
        num_of_posts = extract_exact_info(infos[0])
        followers = extract_exact_info(infos[1])
        following = extract_exact_info(infos[2])
    except Exception:
        # BUGFIX: all bare `except:` handlers narrowed to Exception
        InstaLogger.logger().error("Infos (Follower, Abo, Posts) is empty")
        infos = ""
    try:
        alias_name = container.find_element_by_class_name(
            '-vDIg').find_element_by_tag_name('h1').text
    except Exception:
        InstaLogger.logger().info("alias is empty")
    try:
        bio = container.find_element_by_class_name('-vDIg') \
            .find_element_by_tag_name('span').text
    except Exception:
        InstaLogger.logger().info("Bio is empty")
    try:
        bio_url = container.find_element_by_class_name('yLUwa').text
    except Exception:
        InstaLogger.logger().info("Bio Url is empty")
    try:
        img_container = browser.find_element_by_class_name('RR-M-')
        prof_img = img_container.find_element_by_tag_name('img').get_attribute('src')
    except Exception:
        InstaLogger.logger().info("image is empty")
    try:
        # the 'Nd_Rl' banner only exists on private profiles
        if container.find_element_by_class_name('Nd_Rl'):
            isprivate = True
    except Exception:
        isprivate = False
    InstaLogger.logger().info("alias name: " + alias_name)
    InstaLogger.logger().info("bio: " + bio)
    InstaLogger.logger().info("url: " + bio_url)
    InstaLogger.logger().info("Posts: " + str(num_of_posts))
    InstaLogger.logger().info("Follower: " + str(followers))
    InstaLogger.logger().info("Following: " + str(following))
    InstaLogger.logger().info("isPrivate: " + str(isprivate))
    return alias_name, bio, prof_img, num_of_posts, followers, following, bio_url, isprivate
def extract_post_comments(browser, post):
    """Expand and scrape the comments of *post* (older variant).

    NOTE(review): a second, newer `extract_post_comments` is defined later
    in this file with the same signature; being defined later, it shadows
    this one at import time, so this version is effectively dead code.

    Returns:
        Tuple ``(user_comments, user_commented_list, comment_count)`` where
        `user_comments` is a list of {'user', 'comment'} dicts,
        `user_commented_list` the commenting usernames, and `comment_count`
        is ``len(comments) - 1`` (the first <li> is usually the caption;
        -1 when nothing was found).
    """
    # if more than 22 comment elements, use the second to see
    # how much comments, else count the li's
    # first element is the text, second either the first comment
    # or the button to display all the comments
    comments = []
    user_commented_list = []
    user_comments = []
    try:
        if post.find_elements_by_tag_name('ul'):
            comment_list = post.find_element_by_tag_name('ul')
            comments = comment_list.find_elements_by_tag_name('li')
            if len(comments) > 1:
                # load hidden comments: keep clicking the "load more" /
                # "view all" control (second <li>) until it disappears
                tried_catch_comments = 0
                while (comments[1].text.lower() == 'load more comments'
                       or comments[1].text.lower().startswith('view all')):
                    try:
                        if comments[1].find_element_by_tag_name('button'):
                            print("click button for loading more")
                            comments[1].find_element_by_tag_name(
                                'button').click()
                        elif comments[1].find_element_by_tag_name('a'):
                            print("click a for loading more")
                            comments[1].find_element_by_tag_name('a').click()
                        sleep(Settings.sleep_time_between_comment_loading)
                    except:
                        # best-effort retry; give up after 10 failed clicks
                        print("error on clicking - next try (tried: " +
                              str(tried_catch_comments) + ") comments:" +
                              str(len(comments)) + ")")
                        tried_catch_comments = tried_catch_comments + 1
                        if tried_catch_comments > 10:
                            print("exit getting comments")
                            break
                    # refresh the <li> list so the while condition sees the
                    # newly loaded comments
                    sleep(Settings.sleep_time_between_comment_loading)
                    comment_list = post.find_element_by_tag_name('ul')
                    comments = comment_list.find_elements_by_tag_name('li')
                    # adding who commented into user_commented_list
                InstaLogger.logger().info("found comments: " + str(len(comments)))
            else:
                print("found comment: 1")
            for comm in comments:
                try:
                    # username is the 4th path segment of the commenter link
                    user_commented = comm.find_element_by_tag_name(
                        'a').get_attribute("href").split('/')
                    user_commented_list.append(user_commented[3])
                except:
                    # NOTE(review): if this fails, `user_commented` keeps its
                    # value from the previous iteration (or is unbound on the
                    # first one) and the dict below may credit the wrong user
                    InstaLogger.logger().error(
                        "ERROR something went wrong getting user_commented")
                # first comment has to be loaded everytime to get the caption and tag from post
                if (Settings.output_comments is True or len(user_comments) < 1):
                    user_comment = {}
                    try:
                        user_comment = {
                            'user': user_commented[3],
                            'comment': comm.find_element_by_tag_name('span').text
                        }
                        InstaLogger.logger().info(
                            user_commented[3] + " -- " +
                            comm.find_element_by_tag_name('span').text)
                        user_comments.append(user_comment)
                    except:
                        InstaLogger.logger().error(
                            "ERROR something went wrong getting comment")
            InstaLogger.logger().info(str(len(user_commented_list)) + " comments.")
    except BaseException as e:
        # broad catch: a scrape failure must not abort the whole post
        InstaLogger.logger().error(e)
    except:
        # NOTE(review): unreachable — the BaseException handler above
        # already catches everything
        InstaLogger.logger().error("Error - getting comments")
    return user_comments, user_commented_list, int(len(comments) - 1)
def _likes_count_to_int(likes_text):
    """Parse a like-count label ('1,234', '1.5k', '2m') into an int.

    Returns -1 when the text cannot be parsed, mirroring the caller's
    previous fallback value.
    """
    text = likes_text.strip().lower().replace(',', '')
    try:
        if text.endswith('k'):
            # BUGFIX: the old code stripped '.' first and then did
            # replace('k', '00'), so '10k' became 1000 instead of 10000.
            return int(float(text[:-1]) * 1000)
        if text.endswith('m'):
            return int(float(text[:-1]) * 1000000)
        # A remaining '.' is treated as a thousands separator, as before.
        return int(text.replace('.', ''))
    except (ValueError, TypeError):
        return -1


def extract_post_info(browser, postlink):
    """Get the information from the current post.

    Navigates to *postlink* (if not already there) and scrapes caption,
    location, date, image, likes, comments, mentions and likers.

    Args:
        browser: selenium WebDriver.
        postlink: URL of the post to scrape.

    Returns:
        Tuple ``(caption, location_url, location_name, location_id, lat,
        lng, img, tags, likes, commentscount, date, user_commented_list,
        user_comments, mentions, user_liked_list)``.

    Raises:
        NoInstaPostPageFound: when the post page returns a 404.
    """
    try:
        InstaLogger.logger().info("Scrapping Post Link: " + postlink)
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        raise NoInstaPostPageFound(e)
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " + postlink)
        InstaLogger.logger().error(err)
        pass
    post = browser.find_element_by_class_name('ltEKP')
    date = ''
    # Get caption
    caption = ''
    username = ''
    try:
        username = post.find_element_by_class_name('e1e1d').text
    except Exception:
        InstaLogger.logger().error("ERROR - getting Post infos (username) ")
    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''
    imgs = []
    img = ''
    try:
        # Location url and name
        location_div = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_div:
            location_url = location_div[0].get_attribute('href')
            location_name = location_div[0].text
            # Longitude and latitude.
            # BUGFIX: str.strip(prefix) removes a *character set*, not a
            # prefix; extract the id by splitting on the known path instead
            # (same result for well-formed location URLs).
            location_id = location_url.split(
                '/explore/locations/')[-1].split('/')[0]
            url = 'https://www.instagram.com/explore/locations/' + location_id + '/?__a=1'
            response = requests.get(url)
            data = response.json()
            lat = data['graphql']['location']['lat']
            lng = data['graphql']['location']['lng']
            InstaLogger.logger().info("location_id: " + location_id)
            InstaLogger.logger().info("location_url: " + location_url)
            InstaLogger.logger().info("location_name: " + location_name)
            # BUGFIX: lat/lng are numbers in the JSON; concatenating them to
            # a str raised TypeError, which the old bare except turned into
            # a spurious "perhaps not set" log entry.
            InstaLogger.logger().info("lat: " + str(lat))
            InstaLogger.logger().info("lng: " + str(lng))
    except Exception:
        InstaLogger.logger().error("getting Location Infos (perhaps not set)")
    try:
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        InstaLogger.logger().info("Post date: " + str(date))
    except Exception:
        InstaLogger.logger().error("ERROR - getting Post Date ")
    try:
        imgs = post.find_elements_by_tag_name('img')
        # presumably imgs[0] is the poster's avatar and imgs[1] the actual
        # post image — TODO confirm against current markup
        if len(imgs) >= 2:
            img = imgs[1].get_attribute('src')
        else:
            img = imgs[0].get_attribute('src')
        InstaLogger.logger().info("post image: " + img)
    except Exception:
        InstaLogger.logger().error("ERROR - Post Image ")
    likes = 0
    try:
        likes_element = post.find_elements_by_xpath(
            '//article/div[2]/section[2]/div/div/a/span')
        if len(likes_element) > 1:
            likes = str(likes_element[1].text)
        else:
            likes = str(likes_element[0].text)
        likes = _likes_count_to_int(likes)
        InstaLogger.logger().info("post likes: " + str(likes))
    except Exception as err:
        InstaLogger.logger().error("ERROR - Getting Post Likes")
        InstaLogger.logger().error(err)
    # if likes is not known, it would cause errors to convert empty string to int
    try:
        likes = int(likes)
    except Exception as err:
        InstaLogger.logger().error(
            "ERROR - Extracting number of likes failed. Saving likes as -1")
        InstaLogger.logger().error(err)
        likes = -1
    user_comments = []
    user_commented_list = []
    user_liked_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0
    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except Exception:
        InstaLogger.logger().error(
            "ERROR - getting Post comments function trying")
    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete first comment because its the caption of the user posted
        if len(caption) > 0:
            user_comments.pop(0)
    except Exception:
        InstaLogger.logger().error(
            "ERROR - getting Post caption/tags function")
    try:
        mentions = extract_post_mentions(browser, post)
    except Exception:
        InstaLogger.logger().error("ERROR - getting Post Mentions function")
    try:
        user_liked_list = extract_post_likers(browser, post, postlink, likes)
    except Exception:
        InstaLogger.logger().error("ERROR - getting Post Likers function")
    return caption, location_url, location_name, location_id, lat, lng, img, tags, int(
        likes
    ), commentscount, date, user_commented_list, user_comments, mentions, user_liked_list
def extract_post_likers(browser, post, postlink, likes):
    """Collect the usernames of everyone who liked *post*.

    Opens the post's "liked by" overlay and scrolls it until `likes`
    distinct usernames were seen or scrolling stalls repeatedly.

    Args:
        browser: selenium WebDriver.
        post: WebElement of the post article.
        postlink: canonical URL of the post (used to build the liked_by URL).
        likes: expected number of likers; used as the scroll target.

    Returns:
        List of usernames; may be shorter than `likes` when scrolling gives
        up, and empty when Settings.scrape_posts_likers is disabled.
    """
    user_liked_list = []
    # BUGFIX: this was the placeholder string "******", which is not valid
    # XPath and made every likers query below raise. Restored from the
    # selector preserved in the comments of this function — TODO confirm
    # these class names still match Instagram's current markup.
    xpath_identifier_user = "//li[@class='wo9IH']//a[contains(@class, 'FPmhX')]"
    if (Settings.scrape_posts_likers is False):
        return user_liked_list
    InstaLogger.logger().info("GETTING LIKERS FROM POST")
    postlink = postlink + "liked_by/"
    tried_catch_likers = 0
    likers_list_before = 0
    try:
        # Open the likers overlay via a JS click (more reliable than .click()).
        # post.find_element_by_xpath("//a[contains(@class, 'zV_Nj')]").click()
        elementToClick = post.find_element_by_xpath(
            "//a[contains(@class, 'zV_Nj')]")
        browser.execute_script("arguments[0].click();", elementToClick)
        sleep(3)
        likers_list = post.find_elements_by_xpath(xpath_identifier_user)
        print("LÄNGE " + str(len(likers_list)) + "")
        while len(likers_list) < likes:
            InstaLogger.logger().info("new likers in actual view: " +
                                      str(len(likers_list)) + " - list: " +
                                      str(len(user_liked_list)) +
                                      " should be " + str(likes) +
                                      " -- scroll for more")
            try:
                # Scroll the last row of the overlay into view to trigger
                # lazy-loading of more likers.
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[last()]"
                )  # old:wwxN2
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)
            except BaseException as e:
                # Fall back to scrolling the first row, then retry.
                tried_catch_likers = tried_catch_likers + 1
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[1]")
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)
                # BUGFIX: concatenating the exception object itself raised
                # TypeError (str + Exception) and aborted the scrape; format
                # it explicitly with str(e).
                print("error on scrolling - next try (tried: " +
                      str(tried_catch_likers) + ") Message:" + str(e))
            sleep(Settings.sleep_time_between_post_scroll)
            likers_list = post.find_elements_by_xpath(xpath_identifier_user)
            for liker in likers_list:
                # username is the 4th path segment of the liker's profile URL
                user_like = liker.get_attribute("href").split('/')
                username_liked_post = user_like[3]
                if username_liked_post not in user_liked_list:
                    user_liked_list.append(username_liked_post)
            if (likers_list_before == len(user_liked_list)):
                # No new likers since the last pass: wait longer, nudge the
                # overlay again, and give up after 10 stalled attempts.
                tried_catch_likers = tried_catch_likers + 1
                print("error on scrolling - next try (tried: " +
                      str(tried_catch_likers) + ")")
                sleep(Settings.sleep_time_between_post_scroll * 1.5)
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[1]")
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)
                if tried_catch_likers > 10:
                    InstaLogger.logger().error(
                        "exit scrolling likers " + str(tried_catch_likers) +
                        "x tries - liker list: " + str(len(user_liked_list)) +
                        " should be " + str(likes) + "")
                    break
            likers_list_before = len(user_liked_list)
        InstaLogger.logger().info('likers: ' + str(len(user_liked_list)))
    except BaseException as e:
        # broad catch: a likers failure must not abort the whole post scrape
        InstaLogger.logger().error("Error - getting post likers")
        InstaLogger.logger().error(e)
    return user_liked_list
def extract_post_comments(browser, post):
    """Expand and scrape all comments of *post*.

    Repeatedly clicks the "load more comments" / "view all" control until
    every comment <li> is present, then extracts the commenting usernames
    and (depending on Settings.output_comments) the comment texts.

    Returns:
        Tuple ``(user_comments, user_commented_list, comment_count)`` where
        `user_comments` is a list of {'user', 'comment'} dicts,
        `user_commented_list` the commenting usernames, and `comment_count`
        is ``len(comments) - 1`` (the first <li> is usually the caption;
        -1 when nothing was found).
    """
    # if more than 22 comment elements, use the second to see
    # how much comments, else count the li's
    # first element is the text, second either the first comment
    # or the button to display all the comments
    # sometimes getting comments ends in a endless loop
    # therefore reduce the run
    comments_found_last_run = 0
    comments_run_same_length = 0
    comments = []
    user_commented_list = []
    user_comments = []
    try:
        if post.find_elements_by_tag_name('ul'):
            comment_list = post.find_element_by_tag_name('ul')
            comments = comment_list.find_elements_by_tag_name('li')
            if len(comments) > 1:
                # load hidden comments
                tried_catch_comments = 0
                while (comments[1].text.lower() == 'load more comments'
                       or comments[1].text.lower().startswith('view all')):
                    try:
                        if comments[1].find_element_by_tag_name('button'):
                            print("clicking button for loading more comments")
                            browser.execute_script(
                                "arguments[0].click();",
                                comments[1].find_element_by_tag_name('button'))
                        elif comments[1].find_element_by_tag_name('a'):
                            print("clicking a for loading more")
                            browser.execute_script(
                                "arguments[0].click();",
                                comments[1].find_element_by_tag_name('a'))
                        sleep(Settings.sleep_time_between_comment_loading)
                        comment_list = post.find_element_by_tag_name('ul')
                        comments = comment_list.find_elements_by_tag_name('li')
                        print("comments (loaded: " + str(len(comments)) +
                              "/lastrun: " + str(comments_found_last_run) +
                              ")")
                        if (comments_found_last_run == len(comments)):
                            comments_run_same_length = comments_run_same_length + 1
                            if comments_run_same_length > 10:
                                InstaLogger.logger().error(
                                    "exit getting comments: " +
                                    str(comments_run_same_length) +
                                    "x same length of comments, perhaps endless loop"
                                )
                                break
                        else:
                            # BUGFIX: previously assigned the wrong name
                            # ("comments_same_length"), so the stall counter
                            # never reset and the loop could abort after 10
                            # non-consecutive stalls.
                            comments_run_same_length = 0
                        comments_found_last_run = len(comments)
                    except Exception:
                        InstaLogger.logger().error(
                            "error clicking - next try (tried: " +
                            str(tried_catch_comments) + ") comments:" +
                            str(len(comments)) + ")")
                        tried_catch_comments = tried_catch_comments + 1
                        if tried_catch_comments > 10:
                            InstaLogger.logger().error(
                                "exit getting comments, " +
                                str(tried_catch_comments) +
                                "x tried to get comments")
                            break
                        sleep(Settings.sleep_time_between_comment_loading)
                InstaLogger.logger().info("found comments: " + str(len(comments)))
            else:
                print("found comment: 1")
            # adding who commented into user_commented_list
            for comm in comments:
                # BUGFIX: reset per iteration so a failed username lookup
                # cannot silently attribute this comment to the previous
                # commenter (user_commented used to keep its old value).
                user_commented = []
                try:
                    user_commented = comm.find_element_by_tag_name(
                        'a').get_attribute("href").split('/')
                    user_commented_list.append(user_commented[3])
                except Exception:
                    InstaLogger.logger().error(
                        "ERROR something went wrong getting user_commented")
                # first comment has to be loaded every time to get the
                # caption and tag from post
                if (Settings.output_comments is True or len(user_comments) < 1):
                    user_comment = {}
                    try:
                        user_comment = {
                            'user': user_commented[3],
                            'comment': comm.find_element_by_css_selector(
                                'h2 + span, h3 + span').text
                        }
                        print(user_comment)
                        InstaLogger.logger().info(
                            user_commented[3] + " -- " +
                            comm.find_element_by_css_selector(
                                'h2 + span, h3 + span').text)
                        user_comments.append(user_comment)
                    except Exception:
                        InstaLogger.logger().error(
                            "ERROR something went wrong getting comment")
            InstaLogger.logger().info(str(len(user_commented_list)) + " comments.")
    except BaseException as e:
        # Broad catch: a comment-scrape failure must not abort the whole
        # post. (An unreachable bare `except:` that followed this handler
        # was removed — BaseException already catches everything.)
        InstaLogger.logger().error(e)
    return user_comments, user_commented_list, int(len(comments) - 1)