def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile."""
    page = []

    if save_status == 4:
        page.append(user_id)

    page += [user_id + s for s in section]

    for i, _ in enumerate(scan_list):
        # try:
        driver.get(page[i])

        if save_status != 3:
            utils.scroll(total_scrolls, driver, selectors, scroll_time, dbid)

        data = bs(driver.page_source, 'lxml').find_all(
            'div', attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})

        if len(data) == 0:
            # The feed sometimes fails to render; refresh, re-open the Timeline tab and re-parse.
            driver.refresh()
            time.sleep(0.5)
            driver.find_element_by_xpath("//a[contains(text(),'Timeline')]").click()
            time.sleep(0.5)
            driver.find_element_by_xpath("//a[contains(text(),'Timeline')]").click()
            time.sleep(3)
            data = bs(driver.page_source, 'lxml').find_all(
                'div', attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})

        save_to_file(file_names[i], data, save_status, i)
def getTrends(driver):
    driver.get("https://www.youtube.com/feed/trending")
    utils.scroll(driver, numScrolls=20)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    vids = []
    for k in soup.find_all('div', id='grid-container'):
        vids += k.find_all('ytd-video-renderer')

    videos = utils.getVideoFromSearch(vids)
    return videos
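# NOTE (illustrative sketch): every snippet in this file leans on a
# project-specific utils.scroll helper, and its signature differs from
# scraper to scraper (scroll counts, selectors, cutoff dates, ...).
# The function below is only a guess at what such a helper might look like,
# assuming nothing beyond a Selenium driver and a scroll count; it is not
# the project's actual implementation, and the name is hypothetical.
import time

def scroll_to_bottom(driver, num_scrolls=10, pause=1.0):
    """Scroll the current page num_scrolls times, stopping early if the
    page height stops growing (i.e. no more lazy-loaded content)."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(num_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give lazily loaded content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height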
def downloadImagesByTag(tag, maxImages, isClosing):
    driver = webdriver.Firefox()
    driver.get("https://www.instagram.com/")
    utils.login(driver, username, password)
    searchByTag(driver, tag, 1)
    utils.scroll(driver, 1)

    # target all the link elements on the page
    numberOfImagesDownloaded = download.downloadImages(driver, maxImages, tag)
    print(f'Downloaded {numberOfImagesDownloaded} images')

    if isClosing:
        utils.end(driver, 1)
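# Example call for downloadImagesByTag above (illustrative only: the tag and
# image count are made up, and `username`/`password` are assumed to be
# module-level credentials, as in the snippet itself).
if __name__ == "__main__":
    downloadImagesByTag("sunset", maxImages=25, isClosing=True)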
def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile."""
    page = []

    if save_status == 4:
        page.append(user_id)

    page += [user_id + s for s in section]

    for i, _ in enumerate(scan_list):
        try:
            driver.get(page[i])

            if (save_status == 0) or (save_status == 1) or (save_status == 2):
                # Only run this for friends, photos and videos

                # the bar which contains all the sections
                sections_bar = driver.find_element_by_xpath(
                    selectors.get("sections_bar"))

                if sections_bar.text.find(scan_list[i]) == -1:
                    continue

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time, dbid)

            data = bs(driver.page_source, 'lxml').find_all(
                'div', attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})

            if len(data) == 0:
                driver.refresh()
                time.sleep(0.5)
                driver.find_element_by_xpath(
                    "//a[contains(text(),'Timeline')]").click()
                time.sleep(0.5)
                driver.find_element_by_xpath(
                    "//a[contains(text(),'Timeline')]").click()
                time.sleep(3)
                data = bs(driver.page_source, 'lxml').find_all(
                    'div', attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})

            save_to_file(file_names[i], data, save_status, i)
        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
def scrape_data(url, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile."""
    page = []

    if save_status == 4 or save_status == 5:
        page.append(url)

    page += [url + s for s in section]

    for i, _ in enumerate(scan_list):
        try:
            driver.get(page[i])

            if (save_status == 0) or (save_status == 1) or (save_status == 2):
                # Only run this for friends, photos and videos

                # the bar which contains all the sections
                sections_bar = driver.find_element_by_xpath(
                    selectors.get("sections_bar"))

                if sections_bar.text.find(scan_list[i]) == -1:
                    continue

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time)

            data = driver.find_elements_by_xpath(elements_path[i])
            save_to_file(file_names[i], data, save_status, i)
        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
def scrape_data(url, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrape friends/photos/videos/about/posts (statuses) of a profile."""
    page = []

    pos = -1
    if url[-1] == '/':
        pos = -2
    user_name = url.split("/")[pos]
    print("user_name: ", user_name)

    if save_status == 4 or save_status == 5:
        page.append(url)

    page += [url + s for s in section]

    for i, _ in enumerate(scan_list):
        try:
            driver.get(page[i])

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time)

            data = driver.find_elements_by_xpath(elements_path[i])
            save_to_file(file_names[i], data, save_status, i, user_name)
        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
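# Hedged usage sketch for the scrape_data variants above: scan_list, section,
# elements_path and file_names appear to be index-aligned, one entry per
# profile section, while driver, selectors, total_scrolls and scroll_time are
# module-level globals set up elsewhere. The URL, XPath and file name below
# are illustrative guesses, not the project's real values.
url = "https://www.facebook.com/some.profile/"
scan_list = ["Friends"]
section = ["friends"]
elements_path = ["//div[@data-testid='friend']"]  # hypothetical XPath
file_names = ["friends.txt"]
scrape_data(url, scan_list, section, elements_path,
            save_status=0, file_names=file_names)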
def get_all_page_data(url, is_community=False):
    name = url.split("/")[-1] if len(url.split("/")[-1]) > 0 else url.split("/")[-2]
    if is_community:
        name = os.path.join(name, "community")
        url = url + "/community"

    data_path = os.path.join(".", "data")
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    page_data_path = os.path.join(data_path, name)
    if not os.path.exists(page_data_path):
        os.mkdir(page_data_path)

    should_scrape_headless = not is_community
    driver = initialize_driver(args.chrome, args.windows,
                               is_headless=should_scrape_headless)
    driver.get(url)
    page_name = get_text(driver, './/a[@class="_64-f"]')

    print(f"Scrolling {url} until {cutoff_date}")
    scroll(driver, pd.to_datetime(cutoff_date))

    posts = driver.find_elements_by_xpath(
        '//div[contains(@class, "userContentWrapper")]')
    post_links = [get_post_links(post) for post in tqdm(posts)]
    post_links = list(set(post_links))
    with open(os.path.join(page_data_path, 'post_links.json'), 'w') as f:
        json.dump(post_links, f)
    driver.quit()

    print(f"Now scraping {len(post_links)} posts from {name}")
    for i, post_link in enumerate(post_links):
        if not is_string_url(post_link):
            continue
        print(f"Scraping {post_link}")
        driver = initialize_driver(args.chrome, args.windows)
        driver.get(post_link)

        if "/videos/" in post_link:
            post_type = "videos"
        elif "/photos/" in post_link:
            post_type = "photos"
        elif "/posts/" in post_link:
            post_type = "posts"
        elif "/notes/" in post_link:
            post_type = "notes"
        else:
            post_type = "other"

        if post_type == "notes":
            post_element = driver.find_element_by_xpath(
                './/div[contains(@class, "fb_content")]')
        else:
            post_element = driver.find_element_by_xpath(
                './/div[contains(@class, "userContentWrapper")]')

        post_data = get_post_data(driver, post_element, post_type)
        post_data["page_name"] = page_name
        with open(os.path.join(page_data_path, f'page_post_{i}.json'), 'w') as f:
            json.dump(post_data, f)
        driver.quit()

    if not is_community:
        get_all_page_data(url, is_community=True)
def start_crawler(url, email, password, depth):
    # Check if page has loaded
    def page_loaded(driver):
        return driver.find_element_by_tag_name("body") != None

    # Log and sign into twitter - define or redefine driver
    def log_in():
        # Opening the web browser and the twitter page
        driver = webdriver.Chrome(
            executable_path=r'F:\Dokumentation\Programme\ChromeDriver\chromedriver.exe')
        driver.get(url)
        wait = ui.WebDriverWait(driver, 10)
        wait.until(page_loaded)

        # Signing in
        actions = ActionChains(driver)
        actions.send_keys(email)
        actions.send_keys(Keys.TAB)
        actions.send_keys(password)
        actions.send_keys(Keys.ENTER)
        actions.perform()
        time.sleep(5)
        return driver

    # Opening the web browser and the twitter page
    driver = log_in()

    # Get first node
    html_doc = driver.page_source
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Create Anchor
    id = len(models.GephiNode.objects.all()) + 1
    print("id: ", id)
    label = soup.find(
        "a",
        {"class": "ProfileHeaderCard-nameLink u-textInheritColor js-nav"}).get_text()
    print("label: ", label)
    try:
        fan_count = soup.findAll("span", {"class": "ProfileNav-value"})[2].get_text()
    except:
        fan_count = 0
    print("fan_count: ", fan_count)
    handle = soup.find("b", {"class": "u-linkComplex-target"}).get_text()
    print("handle: ", handle)

    Anchor = models.GephiNode(id=id, label=label, fan_count=fan_count, handle=handle)
    Anchor.create()
    print("Anchor: ", Anchor.id, Anchor.label)

    # Go to the accounts the Anchor is following
    driver.get(url + "/following")
    wait = ui.WebDriverWait(driver, 10)
    wait.until(page_loaded)

    try:
        lnks_cnt = int(
            soup.findAll("span", {"class": "ProfileNav-value"})[1].get_text().replace(".", ""))
    except:
        lnks_cnt = 1500
    print(lnks_cnt)

    links = driver.find_elements_by_xpath("//a[@class='ProfileCard-bg js-nav']")
    print(len(links))

    test = 1
    while len(links) < lnks_cnt - 2 and test < 1500:
        utils.scroll(driver)
        links = driver.find_elements_by_xpath("//a[@class='ProfileCard-bg js-nav']")
        print("following: ", lnks_cnt)
        print("links: ", len(links))
        test += 1

    utils.scroll(driver)
    links = driver.find_elements_by_xpath("//a[@class='ProfileCard-bg js-nav']")

    newUrlList = list()
    for lnk in links:
        newUrlList.append(lnk.get_attribute("href"))
    print("links unshuffled: ", newUrlList)
    shuffle(newUrlList)
    print("links shuffled: ", newUrlList)

    # Check everyone the Anchor follows
    print("link count: ", len(links))
    print("edge count: ", len(models.GephiEdge.objects.filter(source=Anchor.id)))
    if len(newUrlList) > len(models.GephiEdge.objects.filter(source=Anchor.id)):
        # First Followed
        for link in newUrlList:
            if link == "https://twitter.com/account/suspended":
                newUrlList = [
                    x for x in newUrlList
                    if x != "https://twitter.com/account/suspended"
                ]
            else:
                try:
                    driver.get(link)
                except:
                    driver = log_in()
                    driver.get(link)

                # Get HTML
                wait = ui.WebDriverWait(driver, 10)
                wait.until(page_loaded)
                html_doc = driver.page_source
                soup = BeautifulSoup(html_doc, 'html.parser')

                # Create Node
                id = len(models.GephiNode.objects.all()) + 1
                print("id: ", id)
                label = soup.find(
                    "a",
                    {"class": "ProfileHeaderCard-nameLink u-textInheritColor js-nav"}).get_text()
                print("label: ", label)
                try:
                    fan_count = soup.findAll(
                        "span", {"class": "ProfileNav-value"})[2].get_text()
                except:
                    fan_count = 0
                print("fan_count: ", fan_count)
                handle = soup.find("b", {"class": "u-linkComplex-target"}).get_text()
                print("handle: ", handle)

                tempNode = models.GephiNode(id=id, label=label,
                                            fan_count=fan_count, handle=handle)
                tempNode.create()

                # Create Edge
                print("source: ", Anchor.id)
                print("target: ", tempNode.id)
                print("type: ", "Directed")
                edge_id = len(models.GephiEdge.objects.all())
                print("id: ", edge_id)
                firstEdge = models.GephiEdge(source=Anchor.id, target=tempNode.id,
                                             type="Directed", id=edge_id, weight=1)
                firstEdge.create()

                # Break if list is complete in DB
                print("link count: ", len(links))
                print("edge count: ", len(models.GephiEdge.objects.filter(source=Anchor.id)))
                if len(newUrlList) == len(models.GephiEdge.objects.filter(source=Anchor.id)):
                    break

    print("Nodes: ", len(models.GephiNode.objects.all()))
    print("Edges: ", len(models.GephiEdge.objects.all()))
    print("Difference: ",
          len(models.GephiEdge.objects.all()) - len(models.GephiNode.objects.all()))

    driver.close()

    # Recursion
    print("depth: ", depth)
    if depth > 0:
        for uri in newUrlList:
            try:
                start_crawler(uri, email, password, depth - 1)
            except:
                try:
                    start_crawler(uri, email, password, depth - 1)
                except:
                    pass
def open_newsfeed(self):
    self.driver.get(self.url)
    scroll(self.driver, 5)
def twi_hashtag_originator(hashtag, minYear=2006):
    # checking whether the argument is valid
    if hashtag is None or hashtag == "":
        print("Wrong argument: ", hashtag)
        return None

    # cache the hashtag for future use (screenshot)
    cache = "_".join(hashtag.split(" ")).replace("#", "")

    # separate the words and remove #
    h = [j for j in hashtag.split(" ") if j != ""]
    check = lambda x: x[1:] if x.startswith("#") else x
    h = [check(j).lower() for j in h]

    # forming the URL part
    hashtag = f"(%23{h[0]}"
    for rest in h[1:]:
        hashtag += f" AND %23{rest}"
    hashtag += ")"

    # preliminary code
    soup = BeautifulSoup("", "html.parser")
    createStorage("twitter")
    screen = False

    # close driver as soon as work is done
    with setDriver(headless=False) as driver:
        wait = WebDriverWait(driver, 3)
        year = minYear

        # checking if the hashtag has ever actually been used
        url = f"https://twitter.com/search?q={hashtag}"
        driver.get(url)
        articles = returnAllTweets(wait)
        if len(articles) == 0:
            print("Hashtag has not been used in any tweets that are public")
            return None

        # check starting from minYear
        articles = []
        while len(articles) == 0:
            url = f"https://twitter.com/search?q={hashtag}%20until%3A{str(year)}-01-01&src=typed_query"
            driver.get(url)
            articles = returnAllTweets(wait)
            year += 1

        scroll(driver, fastScroll=False)
        ti = returnAllTweets(wait)
        if len(ti) == 0:
            print("Something went wrong.")
            return None

        # remove tweets not containing the hashtag (this occurs if the tweet
        # is a parent tweet of the hashtag-containing tweet)
        t = []
        for post in ti:
            inner = post.get_attribute("innerHTML").lower()
            good = True
            for each in h:
                if f">#{each}</a>" not in inner:
                    good = False
                    break
            if good:
                t.append(post)

        # sort all posts by retrieving the timestamp
        timed = []
        for post in t:
            try:
                s = post.find_element_by_xpath('.//time')
                s = s.get_attribute("datetime")
                if s:
                    try:
                        timed.append({
                            "timestamp": datetime.datetime.strptime(
                                s, "%Y-%m-%dT%H:%M:%S.%fZ"),
                            "elem": post
                        })
                    except:
                        pass
            except NoSuchElementException:
                pass
        t = sorted(timed, key=lambda x: x["timestamp"])

        # element of earliest post selected and screenshot taken
        if len(t) == 0:
            print("Something is seriously wrong")
            return None
        t = t[0]["elem"]
        try:
            with open(f"data/twitter/{cache}.png", "wb") as filex:
                filex.write(t.screenshot_as_png)
            screen = True
        except:
            print("Screenshot was unsuccessful")
        t = t.get_attribute("innerHTML")
        soup = BeautifulSoup(t, "html.parser")

    result = {
        "poster": {
            "full_name": "",
            "username": "",
            "url": "",
            "id": "",
            "profile_image_url": ""
        },
        "timestamp": "",
        "post_link": "",
        "post_text": "",
        "embed": "",
        "screenshot": ""
    }

    a = soup.find("a")
    if a:
        result["poster"]["url"] = completeTwitterLink(a.get('href', ""))
        result["poster"]["username"] = result["poster"]["url"].replace(
            "https://www.twitter.com/", "")
        k = a.findNext("a", {"href": a.get("href", "")})
        if k:
            result["poster"]["full_name"] = k.text.partition("@")[0]
        k = a.find("img")
        if k:
            result["poster"]["profile_image_url"] = k.get("src", "")

    a = soup.find(lambda tag: tag.name == 'a' and tag.find("time"))
    if a:
        link = completeTwitterLink(a.get("href", ""))
        result["post_link"] = link
        result["embed"] = f'<blockquote class="twitter-tweet"><a href="{link}"></a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
        k = a.find("time")
        if k:
            result["timestamp"] = k.get("datetime", "")
        soup = BeautifulSoup(str(soup).partition(str(a))[2], "html.parser")

    result["post_text"] = " ".join([s.text for s in soup.find_all("span")])
    if screen:
        result["screenshot"] = os.getcwd() + f"/data/twitter/{cache}"
    return result
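# Illustrative call to twi_hashtag_originator above; the hashtag is made up.
# On success the function returns a dict with the keys built in its body
# ("poster", "timestamp", "post_link", "post_text", "embed", "screenshot")
# and caches a screenshot under data/twitter/.
origin = twi_hashtag_originator("#opensource", minYear=2008)
if origin is not None:
    print(origin["post_link"], origin["timestamp"])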
def getPosts(self):
    # navigating to page
    newUrl = self.url + "/community"
    if self.driver.current_url != newUrl:
        self.driver.get(newUrl)
    utils.scroll(self.driver)
    sleep(1)
    content = str(self.driver.page_source)

    # open comments
    comm = re.findall(r'View all [0-9]+ comments', content)
    comm += re.findall(r'View comment', content)
    for co in comm:
        elements = self.driver.find_elements_by_xpath(
            "//*[text()[contains(.,'" + co + "')]]")
        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)

    try:
        self.wait.until(
            EC.visibility_of_all_elements_located(
                (By.TAG_NAME, 'ytd-comment-thread-renderer')))
    except TimeoutException:
        pass
    sleep(1)

    soup = BeautifulSoup(self.driver.page_source, "html.parser")
    blocks = soup.find_all('ytd-backstage-post-thread-renderer')
    posts = []

    # extracting posts one by one
    for block in blocks:
        post = self.initiatePostSkeleton()

        # extracting author link and picture
        a = block.find('div', id='author-thumbnail')
        if a:
            a = a.find('a')
            if a:
                post["post_author"]["link"] = utils.completeYoutubeLink(
                    a.get('href', ''))
                img = a.find('img')
                if img:
                    post["post_author"]["media_directory"] = img.get('src', "")

        # extracting author name
        a = block.find('a', id='author-text')
        if a:
            post["post_author"]["name"] = utils.beautifyText(a.text)

        # extracting post link and publish time
        a = block.find('yt-formatted-string', id='published-time-text')
        if a:
            a = a.find('a')
            if a:
                post["timestamp"] = a.text
                post["post_link"] = utils.completeYoutubeLink(a.get('href', ''))

        # extracting text
        a = block.find('yt-formatted-string', id='content-text')
        if a:
            post["post_type"]["is_text"] = True
            post["post_text"] = a.text.replace('\ufeff', '')

        # extracting vote data
        v = block.find('ytd-backstage-poll-renderer', id='poll-attachment')
        if v:
            a = v.find('yt-formatted-string', id='vote-info')
            if a:
                count = a.text.split()[0]
                if count.isnumeric():
                    post["votes"]["total_votes"] = int(count)
            post["post_type"]["is_vote"] = True
            post["votes"]["vote_text"] = post["post_text"]
            a = v.find_all('yt-formatted-string', {'class': 'choice-text'})
            for choice in a:
                post["votes"]["vote_options"].append(choice.text)

        # getting likes
        a = block.find('span', id='vote-count-middle')
        if a:
            count = utils.beautifyText(a.text)
            if count.isnumeric():
                post["post_likes"] = int(count)

        # getting links
        a = []
        srcs = []
        b = block.find('div', id='content')
        c = block.find('div', id='content-attachment')
        for y in [b, c]:
            if y:
                a += y.find_all('a')
                srcs += y.find_all('img')
        for link in a:
            link = utils.completeYoutubeLink(link.get('href', ""))
            if "/watch?" in link:
                post["post_type"]["is_video"] = True
                post["video_links"].append(link)
            elif link not in [post["post_link"], self.url, '', self.origin]:
                post["other_links"].append(link)
        for img in srcs:
            post["post_type"]["is_picture"] = True
            src = img.get('src', '')
            if src != '':
                post["picture_links"].append(src)

        # removing duplicate links
        for option in ["picture_links", "video_links", "other_links"]:
            post[option] = list(set(post[option]))

        # getting comments
        for com in block.find_all('ytd-comment-thread-renderer'):
            comment = self.initiateCommentSkeleton()

            a = com.find('div', id='author-thumbnail')
            if a:
                a = a.find('a')
                if a:
                    comment["comment_author"]["link"] = utils.completeYoutubeLink(
                        a.get('href', ''))
                    img = a.find('img')
                    if img:
                        comment["comment_author"]["media_directory"] = img.get('src', "")

            a = com.find('a', id='author-text')
            if a:
                comment["comment_author"]["name"] = utils.beautifyText(a.text)

            a = com.find('yt-formatted-string', {'class': 'published-time-text'})
            if a:
                a = a.find('a')
                if a:
                    comment["timestamp"] = a.text
                    comment["comment_url"] = utils.completeYoutubeLink(
                        a.get('href', ''))

            a = com.find('yt-formatted-string', id='content-text')
            if a:
                comment["comment_text"] = a.text.replace('\ufeff', '')
                for link in a.find_all('a'):
                    comment["comment_links"].append(
                        utils.completeYoutubeLink(link.get('href', '')))

            a = com.find('span', id='vote-count-middle')
            if a:
                count = utils.beautifyText(a.text)
                if count.isnumeric():
                    comment["comment_likes"] = int(count)

            post["post_comments"].append(comment)

        posts.append(post)

    self.profile["posts"] = posts
    return posts