# find search input and search for term search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]') search_input.send_keys(search_term) search_input.send_keys(Keys.RETURN) sleep(1) # navigate to historical 'latest' tab driver.find_element_by_link_text('Latest').click() # get all tweets on the page tweet_ids = set() last_position = driver.execute_script("return window.pageYOffset;") scrolling = True while scrolling: page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]') for card in page_cards[-15:]: if card is not None: tweet = get_tweet_data(card) if tweet is not None: tweetL = list(tweet) tweetL.append(x) tweetL.append(j) tweet = tuple(tweetL) if tweet: if tweet[2]<cutoff_date: scrolling=False else: idtweet = list(tweet) idtweet[2]=str(idtweet[2]) tweet_id = ''.join(idtweet)
class ReservationEngine:
    """Automate booking a resort day on the Ikon Pass reservation site.

    The engine drives a Chromium-based Edge browser: it logs in, picks a
    resort and a date, records the state of that date, and can confirm the
    reservation.

    State recorded by ``find_date`` / ``reserve``:
        available: the selected day carries neither the "past" nor the
            "unavailable" CSS class.
        booked: the selected day carries the "confirmed" CSS class, or
            ``reserve`` completed the confirmation flow.
        reservations_left: the page did not show a
            "Reservation Limit Reached" notice.
    """

    def __init__(self, email, password, headless=True):
        """Store the credentials and start the Edge web driver.

        Args:
            email: account e-mail typed into the login form.
            password: account password typed into the login form.
            headless: when true, Edge is started with the "headless" switch.
        """
        self.email = email
        self.password = password
        self.available = False
        self.booked = False
        self.reservations_left = False

        options = EdgeOptions()
        options.add_argument("--log-level=3")
        options.use_chromium = True
        if headless:
            options.add_argument("headless")
        print("Starting web driver...")
        self.driver = Edge(options=options)

    def _wait_for(self, by, selector, timeout=10):
        """Block until the element is present in the DOM and return it."""
        return WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((by, selector)))

    def remove_overlay(self):
        """Click the cookie-consent buttons until none is rendered any more."""
        buttons = self.driver.find_elements_by_css_selector("a.cc-btn")
        while any(button.size["height"] != 0 for button in buttons):
            for button in buttons:
                try:
                    button.click()
                except Exception:
                    # Best effort: a button may be hidden or already gone.
                    # Catching Exception (not a bare except) keeps Ctrl-C
                    # able to interrupt this retry loop.
                    pass
            buttons = self.driver.find_elements_by_css_selector("a.cc-btn")

    def login(self):
        """Open the login page, submit the credentials and wait for the
        resort search box to appear."""
        print("Logging in")
        self.driver.get(
            "https://account.ikonpass.com/en/login?redirect_uri=/en/myaccount/add-reservations/"
        )
        self.remove_overlay()

        email_box = self.driver.find_element_by_css_selector("input#email")
        email_box.send_keys(self.email)
        password_box = self.driver.find_element_by_css_selector(
            "input#sign-in-password")
        password_box.send_keys(self.password)
        self.driver.find_element_by_css_selector("button.submit").click()

        self._wait_for(By.CSS_SELECTOR, 'input.react-autosuggest__input')
        print("Logged in")

    def refresh(self):
        """Reload the current page."""
        self.driver.refresh()

    def find_date(self, date, resort):
        """Select ``resort`` and ``date`` and record the day's state.

        Args:
            date: object with ``month`` and ``strftime`` (e.g. a
                ``datetime.date``); the day to select in the date picker.
            resort: text typed into the resort search box; the first
                suggestion is chosen.

        Updates ``available``, ``booked`` and ``reservations_left``.
        """
        self._wait_for(By.CSS_SELECTOR, 'input.react-autosuggest__input')
        self.remove_overlay()

        # select resort
        search = self.driver.find_element_by_css_selector(
            "input.react-autosuggest__input")
        search.send_keys(resort)
        self.driver.find_element_by_css_selector(
            "li#react-autowhatever-resort-picker-section-1-item-0").click()
        self.driver.find_element_by_xpath(
            "//span[contains(text(), 'Continue')]").click()
        self._wait_for(By.CSS_SELECTOR, 'div.DayPicker-wrapper')
        self.remove_overlay()

        # select date
        datepicker = self.driver.find_element_by_css_selector(
            "div.DayPicker-wrapper")
        # The month name does not change while paging, so build the query once.
        month_text = calendar.month_name[date.month]
        month_xpath = "//span[contains(text(), " + "'" + month_text + "')]"
        # NOTE(review): an XPath starting with "//" is evaluated against the
        # whole document even when called on an element; kept as in the
        # original because the page structure cannot be confirmed from here.
        while not datepicker.find_elements_by_xpath(month_xpath):
            datepicker.find_element_by_class_name(
                "icon-chevron-right").click()

        day = datepicker.find_element_by_xpath(
            "//div[@aria-label='" + date.strftime("%a %b %d %Y") + "']")
        day.click()
        day_classes = day.get_attribute(name="class")
        self.available = ("past" not in day_classes
                          and "unavailable" not in day_classes)
        self.booked = "confirmed" in day_classes
        limit_notices = self.driver.find_elements_by_xpath(
            "//div[contains(text(), 'Reservation Limit Reached')]")
        self.reservations_left = len(limit_notices) == 0
        print("Date Selected: " + date.strftime("%m/%d/%Y"))

    def reserve(self):
        """Confirm the reservation when the selected day can be booked.

        Returns:
            The value of ``booked`` after the attempt.
        """
        if self.available and not self.booked and self.reservations_left:
            self.remove_overlay()
            self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Save')]").click()
            self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Continue to Confirm')]").click()
            self._wait_for(By.XPATH, "//input[@type='checkbox']").click()
            self._wait_for(
                By.XPATH,
                "//span[contains(text(), 'Confirm Reservations')]").click()
            self.booked = True
            print("Booked")
        return self.booked

    def log_results(self, log_file_name):
        """Append one timestamped status line to ``log_file_name``."""
        timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        status = ": Available - %r, Booked - %r, Reservations Left- %r" % (
            self.available, self.booked, self.reservations_left)
        with open(log_file_name, "a") as f:
            f.write(timestamp + status + "\n")

    def close_driver(self):
        """End the WebDriver session.

        ``quit()`` is used instead of ``close()``: ``close()`` only closes the
        current window, while ``quit()`` also shuts the driver session down.
        """
        self.driver.quit()
edge_options.add_argument( 'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)\ AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63"' ) #edge_options.binary_location = executable_path driver = Edge(executable_path=executable_path, options=edge_options) driver.maximize_window() driver.get(url) #sleep(1) # 自动输入学号和密码 driver.find_elements_by_xpath( '//*[@id="app"]/div/div[3]/div[1]/input')[0].send_keys( your_id) driver.find_elements_by_xpath( '//*[@id="app"]/div/div[3]/div[2]/input')[0].send_keys( your_password) # driver.find_elements_by_xpath('//*[@id="app"]/div/div[3]/button')[0].click() for i in range(10): try: # 获取验证码 处理验证码 print('第%i次尝试' % (i + 1)) driver.find_elements_by_xpath( '//*[@id="app"]/div/div[3]/div[4]')[0].click() sleep(0.5)
# Start a Chromium-based Edge session with the unpacked "markdown-clipper"
# extension loaded and downloads redirected to "Markdown Output".
options = EdgeOptions()
options.use_chromium = True

unpacked_extension_path = os.path.join(os.getcwd(), "markdown-clipper")
options.add_argument("--load-extension=" + unpacked_extension_path)

download_path = os.path.join(os.getcwd(), "Markdown Output\\")
prefs = {}
prefs["download.default_directory"] = download_path
prefs["profile.default_content_settings.popups"] = 0
prefs["directory_upgrade"] = True

# "detach" is passed so the browser is not tied to this script's lifetime.
for option_name, option_value in (("prefs", prefs), ("detach", True)):
    options.add_experimental_option(option_name, option_value)

driver = Edge(options=options)
url = "https://networklessons.com/cisco/ccna-200-301"
driver.get(url)

# In[43]:

# Look both banners up first, then click whichever ones were found.
agree = driver.find_elements_by_xpath('//*[@id="catapult-cookie-bar"]/div/div')
join = driver.find_elements_by_xpath(
    '//*[@id="om-mvhsujbebu4nqhlzcsgs-optin"]/div/button')
for matches in (agree, join):
    if matches:
        matches[0].click()

# In[ ]:
class Web_scraping:
    """Scrape World Surf League event links and per-heat wave scores.

    ``games_link`` collects event URLs into ``events.txt``; ``event_stats``
    opens the first of them and appends one row per heat to
    ``all_heats_lists``.
    """

    # Each surfer contributes 18 "score" cells on a round page:
    # 2 best waves, the total, and 15 individual waves.
    _SCORES_PER_SURFER = 18
    # Rows are laid out for two surfers per heat.
    # NOTE(review): the original comment mentions heats with 3 surfers; those
    # are not supported by this row layout.
    _SURFERS_PER_HEAT = 2
    _DEFAULT_DRIVER_PATH = (
        r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")

    def __init__(self, executable_path=None):
        """Initialize the application.

        Args:
            executable_path: path to ``msedgedriver``; defaults to the
                standard Edge install location on Windows.
        """
        # The standard webdriver prints logging noise to the terminal, so the
        # 'enable-logging' switch is excluded.
        self.opt = EdgeOptions()
        self.opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.opt.add_argument("--start-maximized")
        self.opt.use_chromium = True
        if executable_path is None:
            executable_path = self._DEFAULT_DRIVER_PATH
        self.driver = Edge(executable_path=executable_path, options=self.opt)

    def games_link(self):
        """Collect the links of all completed events of every season.

        Builds ``season_pages_list`` (one URL per year 2008-2021) and
        ``events_link_list``, then writes the event links to ``events.txt``,
        one per line.
        """
        self.season_pages_list = [
            'https://www.worldsurfleague.com/events/' + str(year) + '/mct?all=1'
            for year in range(2008, 2022)
        ]

        self.events_link_list = []
        for link in self.season_pages_list:
            self.driver.get(link)
            self.event_links = self.driver.find_elements_by_xpath(
                '//a[@class="event-schedule-details__event-name"]')
            # Only as many links as there are "completed" markers are kept,
            # i.e. completed events are assumed to be listed first.
            self.event_status = self.driver.find_elements_by_xpath(
                '//span[@class="event-status event-status--completed"]')
            completed = self.event_links[:len(self.event_status)]
            self.events_link_list.extend(
                anchor.get_attribute('href') for anchor in completed)

        with open('events.txt', 'w') as f:
            f.writelines("%s\n" % item for item in self.events_link_list)
        print('FINISHED')

    @staticmethod
    def _heat_columns(athlete_names, countries, scores, heat_index):
        """Return the surfer columns of one heat.

        For each of the heat's two surfers the result holds the name, the
        nationality and that surfer's 18 score cells, in page order.

        Args:
            athlete_names: names of all surfers on the page, in page order.
            countries: nationalities, parallel to ``athlete_names``.
            scores: text of all score cells on the page, in page order.
            heat_index: zero-based position of the heat on the page.

        Raises:
            IndexError: when the page holds fewer surfers or score cells than
                the heat requires.
        """
        per_surfer = Web_scraping._SCORES_PER_SURFER
        first_surfer = heat_index * Web_scraping._SURFERS_PER_HEAT
        columns = []
        for surfer in range(first_surfer,
                            first_surfer + Web_scraping._SURFERS_PER_HEAT):
            columns.append(athlete_names[surfer])
            columns.append(countries[surfer])
            # Score cells are grouped per surfer, so the offset is derived
            # from the surfer index, not from the heat index.
            start = surfer * per_surfer
            columns.extend(
                [scores[i] for i in range(start, start + per_surfer)])
        return columns

    def _scrape_current_round(self):
        """Append one row per heat of the displayed round to
        ``all_heats_lists``."""
        find_all = self.driver.find_elements_by_xpath
        find_class = self.driver.find_element_by_class_name

        self.waves = find_all('//*[@class="score"]')
        self.athletes = find_all('//*[@class="athlete-name"]')
        self.nationalities = find_all('//*[@class="athlete-country-flag"]')
        self.heat = find_all('//*[@class="new-heat-hd-name"]')
        # The selected carousel item carries the round name.
        self.round = find_all('//*[@class="carousel-item is-selected"]')
        if len(self.round) == 0:
            self.round = find_all(
                '//*[@class="carousel-item last is-selected"]')

        self.number_of_surfers = len(self.waves) // self._SCORES_PER_SURFER
        self.number_of_heats = len(self.heat)
        if self.number_of_heats == 0:
            # Nothing to record; also avoids dividing by zero below.
            self.surfers_per_heat = 0
            return
        self.surfers_per_heat = (
            self.number_of_surfers // self.number_of_heats)

        # Event-level values are the same for every heat on the page, so they
        # are read once instead of once per heat.
        tour_info = find_class('event-meta-tour-info').text
        self.event_turn = tour_info.split()[2][1:]
        self.event_period = find_class('event-schedule__date-range').text
        self.event_name = find_class('event-title').text.split('\n')[0]
        self.event_local = re.split(r'(\d+)', tour_info)[2]
        # NOTE(review): this reads the first 'new-heat-hd-status' element and
        # applies it to every heat, as the original did — confirm intent.
        self.avg_wave_score = re.split(
            r'(\d+\.\d+)', find_class('new-heat-hd-status').text)[1]
        round_name = self.round[0].text

        athlete_names = [athlete.text for athlete in self.athletes]
        countries = [
            flag.get_attribute('title') for flag in self.nationalities
        ]
        scores = [wave.text for wave in self.waves]

        for g in range(self.number_of_heats):
            # Heat's id for the database
            heat_id = (f'heat{g + 1}' + round_name + self.event_turn +
                       self.event_period[-4:]).lower()
            row = [
                heat_id, self.event_name, self.event_local, self.event_turn,
                self.event_period, self.avg_wave_score
            ]
            row.extend(
                self._heat_columns(athlete_names, countries, scores, g))
            self.all_heats_lists.append(row)

    def _go_to_previous_round(self):
        """Click the carousel button that shows the previous round."""
        try:
            self.driver.find_element_by_xpath(
                '//*[@class="flickity-button-icon"]').click()
        except Exception:
            # The button may be outside the viewport: scroll to it and retry.
            self.prev_round_bt = self.driver.find_element_by_xpath(
                '//*[@class="flickity-button-icon"]')
            self.driver.execute_script("arguments[0].scrollIntoView();",
                                       self.prev_round_bt)
            time.sleep(.5)
            self.prev_round_bt.click()
        time.sleep(2.5)

    def event_stats(self):
        """Scrape every round of the first event listed in ``events.txt``.

        Each heat becomes one row in ``all_heats_lists``: heat id, event
        name, location, turn, period, average wave score, then name,
        nationality and 18 scores for each of the two surfers.

        NOTE(review): as in the original, the loop has no explicit stop
        condition; it ends when navigating to the previous round raises.
        """
        self.events_link = [
            line[0]
            for line in pd.read_fwf('events.txt', header=None).values.tolist()
        ]
        self.driver.get(self.events_link[0])

        self.all_heats_lists = []
        while True:
            self._scrape_current_round()
            print(self.all_heats_lists)
            self._go_to_previous_round()
# Scrape the e-mail addresses listed on the page at `url`, derive a password
# hash for each of them, and open the MongoDB connection.
users = []
emails = []
passwords = []

options = EdgeOptions()
options.use_chromium = True
options.add_argument('headless')
# Raw string: "\P" and "\m" are not valid escape sequences, so the non-raw
# form triggers an invalid-escape warning. The resulting path is unchanged.
driver = Edge(executable_path=r'C:\Program Files\msedgedriver.exe',
              options=options)
driver.get(url)
# Fixed pause to let the page finish rendering before reading the list.
time.sleep(5)

lis = driver.find_elements_by_xpath(
    "/html/body/div/div[1]/main/article/div[2]/ol/li")
for li in lis:
    emails.append(li.text)

# Password = MD5 of "<local part of the e-mail>1234".
# NOTE(security): unsalted MD5 of a predictable value is not suitable for
# protecting real passwords; kept because stored hashes depend on it.
for email in emails:
    pwd = email.split('@')[0]
    pwd += '1234'
    pwd = hashlib.md5(pwd.encode())
    passwords.append(pwd.hexdigest())

#-------------------------------------------------------
# NOTE(security): the connection string embeds credentials; they should be
# loaded from the environment or a secrets store instead of source code.
client = pymongo.MongoClient(
    "mongodb+srv://sgdb:[email protected]/Music?retryWrites=true&w=majority"
)
class Session:
    """A logged-in Twitter browser session used to search and post tweets."""

    def __init__(self, username, password, sleep_time=2):
        """Store the credentials and start a Chromium-based Edge driver.

        Args:
            username: value typed into the username/e-mail field.
            password: value typed into the password field.
            sleep_time: base pause, in seconds, used between page actions.
        """
        self.username = username
        self.password = password
        self.sleep_time = sleep_time
        options = EdgeOptions()
        options.use_chromium = True
        self.driver = Edge(options=options)

    def login(self):
        """Open the login page and submit the stored credentials."""
        self.driver.get("https://www.twitter.com/login")
        sleep(self.sleep_time)
        u_name = self.driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        u_name.send_keys(self.username)
        p_word = self.driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        p_word.send_keys(self.password)
        p_word.send_keys(Keys.RETURN)
        sleep(self.sleep_time)

    def _scroll_down(self, last_pos):
        """Scroll to the bottom of the page, retrying up to three times.

        Returns:
            ``(True, new_position)`` when the page moved, or
            ``(False, last_pos)`` when three attempts did not move it.
        """
        for attempt in range(3):
            self.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            sleep(self.sleep_time)
            curr_pos = self.driver.execute_script(
                "return window.pageYOffset;")
            if curr_pos != last_pos:
                return True, curr_pos
            if attempt < 2:
                # Give slow-loading content more time before retrying.
                sleep(2 * self.sleep_time)
        return False, last_pos

    @staticmethod
    def _write_csv(csv_tit, rows):
        """Write the header and the tweet rows to ``csv_tit`` as UTF-8 CSV."""
        # newline='' lets the csv module control line endings; without it
        # Windows output gets an empty line after every row.
        with open(csv_tit, 'w', encoding="utf-8", newline='') as out:
            csv_out = csv.writer(out)
            csv_out.writerow([
                'user', 'date', 'text', 'quoting', 'reply count',
                'retweet count', 'like count'
            ])
            csv_out.writerows(rows)

    def tweet_selection(self, search_str, csv_tit, max_tweets=300):
        """Search for ``search_str`` and save the tweets found to a CSV file.

        Args:
            search_str: query typed into the search box.
            csv_tit: path of the CSV file to (over)write.
            max_tweets: stop after this many distinct tweets were collected;
                ``None`` collects until the page cannot be scrolled further.
        """
        sleep(self.sleep_time)
        search_input = self.driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.clear()
        search_input.send_keys(search_str)
        search_input.send_keys(Keys.RETURN)
        sleep(self.sleep_time)

        data = []
        tweet_ids = set()
        last_pos = self.driver.execute_script("return window.pageYOffset;")
        scrolling = True
        while scrolling:
            cards = self.driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # Only the last cards can be new after a scroll.
            for card in cards[-15:]:
                tweet = self.get_tweet_data(card)
                if not tweet:
                    continue
                # The concatenated fields act as the de-duplication key.
                tweet_id = ''.join(tweet)
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    data.append(tweet)
            if max_tweets is not None and len(data) >= max_tweets:
                del data[max_tweets:]
                break
            scrolling, last_pos = self._scroll_down(last_pos)

        self._write_csv(csv_tit, data)

    def get_tweet_data(self, card):
        """Extract the fields of one tweet card.

        Returns:
            ``(user, date, text, responding, reply_count, retweet_count,
            like_count)`` as strings, or ``None`` when the card has no
            ``<time>`` element.
        """
        user = card.find_element_by_xpath('.//span[contains(text(),"@")]').text
        try:
            date = card.find_element_by_xpath('.//time').get_attribute(
                'datetime')
        except NoSuchElementException:
            return None
        text = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
        responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
        reply_count = card.find_element_by_xpath(
            './/div[@data-testid="reply"]').text
        retweet_count = card.find_element_by_xpath(
            './/div[@data-testid="retweet"]').text
        like_count = card.find_element_by_xpath(
            './/div[@data-testid="like"]').text
        return (user, date, text, responding, reply_count, retweet_count,
                like_count)

    def tweet(self, tuit):
        """Type ``tuit`` into the tweet composer.

        Requires interacting with the Edge window.
        """
        sleep(self.sleep_time)
        tuit_input = self.driver.find_element_by_xpath(
            '//div[@data-testid="tweetTextarea_0"]')
        tuit_input.clear()
        tuit_input.send_keys(tuit)
class TwitterBot():
    """Drive an Edge browser to log in to Twitter, search and scrape tweets."""

    def __init__(self):
        """Start Edge, maximize it and open the Twitter home page."""
        self.driver = Edge()
        self.driver.maximize_window()
        self.driver.get('https://twitter.com')
        self.driver.implicitly_wait(3)

    def goToTwitter(self):
        """Navigate to the Twitter home page."""
        self.driver.get('https://twitter.com')

    def login(self):
        """Open the login form and submit the module-level ``username`` and
        ``password``."""
        self.driver.find_element_by_xpath("//a[@href='/login']").click()
        # Before this pause another element with the same name exists on the
        # page; waiting makes sure the right one is picked up.
        sleep(1)
        self.driver.find_element_by_xpath(
            "//input[@name='session[username_or_email]']").send_keys(username)
        self.driver.find_element_by_xpath(
            "//input[@name='session[password]']").send_keys(password)
        self.driver.find_element_by_xpath(
            "//div[@data-testid='LoginForm_Login_Button']").click()

    def _search_box(self):
        """Return the search input element (looked up on every call)."""
        return self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']")

    def basicSearch(self, topic):
        """Type ``topic`` into the search box and submit it."""
        self._search_box().send_keys(topic)
        self._search_box().submit()

    def advancedSearch(self, exact, any, none, hashtags, dateFrom, dateTo):
        """Build a Twitter search query from the given parts and submit it.

        Every argument may be ``None`` to leave that part out.

        Args:
            exact: exact phrase, wrapped in double quotes.
            any: alternatives, wrapped in parentheses.
            none: term to exclude, prefixed with ``-``.
            hashtags: hashtag without ``#``, wrapped as ``(#tag)``.
            dateFrom: value for the ``since:`` operator.
            dateTo: value for the ``until:`` operator.
        """
        # Each part ends with a space so the parts can simply be joined.
        parts = []
        if exact is not None:
            parts.append('"' + exact + '" ')
        if any is not None:
            parts.append('(' + any + ') ')
        if none is not None:
            parts.append('-' + none + ' ')
        if hashtags is not None:
            parts.append('(#' + hashtags + ') ')
        if dateTo is not None:
            parts.append('until:' + dateTo + ' ')
        if dateFrom is not None:
            parts.append('since:' + dateFrom + ' ')
        finalSearch = ''.join(parts)

        self._search_box().send_keys(finalSearch)
        self._search_box().submit()

    def scrapeTweets(self, desiredNum):
        """Scroll the current page and save up to ``desiredNum`` text lines.

        Lines are written, with punctuation removed, to ``tweets.csv`` (UTF-8,
        one per line). Scraping stops when enough lines were collected or a
        scroll produced no change in the visible lines.

        Returns:
            The collected lines (punctuation intact) concatenated into one
            string.
        """
        collected = []
        oldDataLines = []
        dataLines = ['init']
        # Fixed UI labels and blanks that are never tweet content.
        dirtyArray = {
            'Quote Tweet', 'Promoted', 'Show this thread', '', '\n', ' '
        }
        # XPath queries for page regions that must not be picked up.
        dirtyQueries = (
            "//div[@class='css-1dbjc4n r-1d09ksm r-18u37iz r-1wbh5a2']",
            "//div[@class = 'css-1dbjc4n r-18u37iz r-1wtj0ep r-156q2ks r-1mdbhws']",
            "//div[contains(text(),'Replying to')]",
            "//div[@role = 'blockquote']",
        )
        numDataLines = 0

        # The context manager closes the file even when scraping raises.
        with open('tweets.csv', 'w', encoding='utf-8') as tweetsFile:
            while numDataLines < desiredNum and oldDataLines != dataLines:
                oldDataLines = dataLines
                sleep(1)

                dirtyData = []
                for query in dirtyQueries:
                    dirtyData.extend(self.driver.find_elements_by_xpath(query))
                # Element text spans several lines; compare line by line.
                dirtyLines = {
                    chunk for dirt in dirtyData
                    for chunk in dirt.text.split('\n')
                }

                # This still includes dirty data, weeded out below.
                data = self.driver.find_elements_by_xpath(
                    "//div[@data-testid='tweet']")
                dataLines = [
                    chunk for datapoint in data
                    for chunk in datapoint.text.split('\n')
                ]

                # Lines seen in the previous pass are skipped to avoid
                # duplicates.
                seenBefore = set(oldDataLines)
                for line in dataLines:
                    if (line in dirtyLines or line in seenBefore
                            or line in dirtyArray):
                        continue
                    if numDataLines >= desiredNum:
                        break
                    try:
                        noPunctuationLine = re.sub(r'[^\w\s]', '', line)
                        tweetsFile.write(noPunctuationLine)
                        tweetsFile.write("\n")
                    except UnicodeError:
                        print('This data point not encodable.')
                        continue
                    collected.append(line)
                    numDataLines += 1

                height = self.driver.execute_script(
                    "return document.documentElement.scrollHeight")
                self.driver.execute_script("window.scrollTo(0, " +
                                           str(height) + ");")

        return ''.join(collected)
def scrape(secure=False):
    """Log in to Twitter, run the built-in search and collect its tweets.

    Args:
        secure: when true, a second login form (username/phone challenge) is
            filled in after the first one.

    Returns:
        A list with one entry per distinct tweet, as returned by
        ``scrap_tweets``, in the order they were encountered.
    """
    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)
    # The driver is local to this function, so it must be released here;
    # otherwise every call leaves a browser and driver process behind.
    try:
        # NOTE(review): the text entered here is not used; the search below
        # is hard-coded. Kept as in the original — confirm intent.
        query = input("▁ ▂ ▄ ▅ ▆ ▇ █ 𝐄𝐧𝐭𝐞𝐫 𝐭𝐡𝐞 𝐓𝐞𝐱𝐭 𝐭𝐨 𝐬𝐞𝐚𝐫𝐜𝐡 █ ▇ ▆ ▅ ▄ ▂ ▁\n\n ")
        print("\n𝘚𝘵𝘢𝘳𝘵𝘦𝘥 𝘚𝘤𝘳𝘢𝘱𝘪𝘯𝘨 ↦↦↦↦↦↦↦↦↦↦")
        print("\nPlease Wait ............\n")

        driver.get("https://www.twitter.com/login")
        driver.maximize_window()

        # NOTE(security): credentials are hard-coded placeholders; load real
        # ones from the environment or prompt for them.
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys("*****@*****.**")
        #password=getpass()
        userpas = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        userpas.send_keys('-----')
        userpas.send_keys(Keys.RETURN)
        sleep(2)

        if secure:
            username = driver.find_element_by_xpath(
                '//input[@name="session[username_or_email]"]')
            username.send_keys("031-----")
            userpas = driver.find_element_by_xpath(
                '//input[@name="session[password]"]')
            userpas.send_keys('----')
            userpas.send_keys(Keys.RETURN)
            sleep(2)

        search = driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search.send_keys('"پاک فوج" lang:ur -filter:links filter:replies')
        search.send_keys(Keys.RETURN)
        sleep(1.5)
        driver.find_element_by_link_text("Latest").click()

        data = []
        tweet_ids = set()
        last_position = driver.execute_script("return window.pageYOffset;")
        scrolling = True
        while scrolling:
            posts = driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # Only the last cards can be new after a scroll.
            for post in posts[-15:]:
                tweet = scrap_tweets(post)
                if not tweet:
                    continue
                # The concatenated fields act as the de-duplication key.
                tweet_id = "".join(tweet)
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    data.append(tweet)

            # Scroll to the bottom; give up after three attempts that do not
            # move the page.
            scroll_attempt = 0
            while True:
                driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);")
                sleep(1)
                curr_position = driver.execute_script(
                    "return window.pageYOffset;")
                if last_position != curr_position:
                    last_position = curr_position
                    break
                scroll_attempt += 1
                if scroll_attempt >= 3:
                    scrolling = False
                    break
                sleep(2)
        return data
    finally:
        driver.quit()
def main():
    """Scrape tweets for every person listed in the input file and build a
    report per person.

    The input file path is read from ``sys.argv[4]``. Each non-empty line is
    comma-separated and its first field is the person's name. For each name a
    fresh browser session logs in, searches verified accounts, opens the
    first result and collects tweets while scrolling; the tweets are written
    to ``<name>.csv``, analysed by the three models and passed to
    ``create_report``.
    """
    args = sys.argv
    # Only the name column is used; blank lines are skipped.
    with open(args[4], "r") as f:
        names = [
            line.rstrip("\n").split(",")[0] for line in f if line.strip()
        ]

    for name in names:
        print("Query:", name, ".\nProcessing...")
        user = '******'
        search_term = f'{name} filter:verified'

        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        # A new driver is started per name; try/finally guarantees it is
        # shut down even when a lookup on the page fails.
        try:
            driver.get('https://www.twitter.com/login')
            driver.maximize_window()
            sleep(2)

            username = driver.find_element_by_xpath(
                '//input[@name="session[username_or_email]"]')
            username.send_keys(user)
            # NOTE(security): the password is hard-coded; it should come from
            # the environment or a secrets store.
            password = driver.find_element_by_xpath(
                '//input[@name="session[password]"]')
            password.send_keys('donkey123')
            password.send_keys(Keys.RETURN)
            sleep(1)

            search_input = driver.find_element_by_xpath(
                '//input[@aria-label="Search query"]')
            search_input.send_keys(search_term)
            search_input.send_keys(Keys.RETURN)
            sleep(1)

            driver.find_element_by_link_text('People').click()
            sleep(3)
            # Open the first entry of the people results.
            driver.find_element_by_xpath(
                '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
            ).click()
            sleep(3)

            tweet_data = []
            scroll_y = 500
            for _ in range(5):
                sleep(1)
                cards = driver.find_elements_by_xpath(
                    '//div[@data-testid="tweet"]')
                for card in cards:
                    data = get_tweet_data(card)
                    if data:
                        tweet_data.append(data)
                # scrollTo takes (x, y): keep x at 0 and advance only the
                # vertical offset, 500 px per pass.
                driver.execute_script(f'window.scrollTo(0,{scroll_y});')
                scroll_y += 500
        finally:
            driver.quit()

        # The same tweet is usually seen on several passes; keep one copy.
        tweets = set(tweet_data)
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')

        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)
        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
class Sei:
    """Selenium wrapper that drives a SEI web session.

    The instance owns a Chrome or Edge WebDriver (``self.driver``) and offers
    helpers to log in, open a process/document through the quick-search box,
    read its status and contents, and trigger toolbar actions.  It can be used
    as a context manager; leaving the ``with`` block calls :meth:`close`.
    """

    # Unit that was selected right after login; restored by close().
    __area_inicial = None
    # Number of window handles just before / just after the last quick search.
    __windows_before = 0
    __windows_after = 0

    def __init__(self, headless=False, executable_path='chromedriver'):
        """Start the browser that matches *executable_path*.

        Args:
            headless: Run the browser without a visible window.
            executable_path: Path to the driver binary; it must contain
                ``chromedriver`` or ``msedgedriver``.

        Raises:
            ValueError: If *executable_path* names neither supported driver
                (previously the instance was left without a ``driver``).
        """
        flags = [
            'enable-javascript',
            'window-size=1440,900',
            'disable-extensions',
            "proxy-server='direct://'",
            'proxy-bypass-list=*',
            'start-maximized',
            'disable-dev-shm-usage',
            'no-sandbox',
            'ignore-certificate-errors',
        ]
        if headless:
            flags += ['headless', 'disable-gpu']

        if 'chromedriver' in executable_path:
            chrome_options = Options()
            for flag in flags:
                chrome_options.add_argument('--' + flag)
            self.driver = webdriver.Chrome(executable_path=executable_path,
                                           options=chrome_options)
        elif 'msedgedriver' in executable_path:
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            for flag in flags:
                edge_options.add_argument(flag)
            self.driver = Edge(executable_path=executable_path,
                               options=edge_options)
        else:
            raise ValueError(
                "executable_path must point to 'chromedriver' or "
                "'msedgedriver', got %r" % (executable_path,))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _enter_visualizacao_frame(self):
        """Wait for the ``ifrVisualizacao`` iframe and switch into it."""
        frame = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(frame)

    def _enter_arvore_html_frame(self):
        """Wait for the ``ifrArvoreHtml`` iframe and switch into it."""
        frame = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
        self.driver.switch_to.frame(frame)

    def start_driver(self, url, usuario=None, senha=None):
        """Open *url* and log in, prompting for any missing credential.

        Raises:
            Exception: With the alert text when the site shows an alert after
                the login form is submitted (invalid user or password).
        """
        if usuario is None:
            usuario = input('Digite o usuário: ')
        if senha is None:
            senha = getpass('Digite a senha: ')
        self.driver.get(url)

        usuario_field = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtUsuario")))
        senha_field = self.driver.find_element(By.ID, 'pwdSenha')
        botao_acessar = self.driver.find_element(By.ID, 'sbmLogin')

        usuario_field.clear()
        usuario_field.send_keys(usuario)
        senha_field.clear()
        senha_field.send_keys(senha)
        botao_acessar.click()

        alerta = self.fechar_alerta()
        if alerta:
            raise Exception(alerta)  # invalid user or password
        # Remember the unit so close() can switch back to it.
        self.__area_inicial = self.get_area()

    def go_to(self, numero_sei):
        """Open *numero_sei* through the quick-search form.

        If the previous search opened an extra window, that window is closed
        first.  If this search opens a new window, the driver switches to it.
        """
        if self.__windows_after > self.__windows_before:
            self.driver.close()
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_before - 1])
        self.driver.switch_to.default_content()

        pesquisa = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtPesquisaRapida")))
        pesquisa.clear()
        pesquisa.send_keys(str(numero_sei))
        form_pesquisa = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located(
                (By.ID, "frmProtocoloPesquisaRapida")))

        # Compare the handle count around the submit to detect a new window.
        self.__windows_before = len(self.driver.window_handles)
        form_pesquisa.submit()
        self.__windows_after = len(self.driver.window_handles)
        if self.__windows_after > self.__windows_before:
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_after - 1])

    def is_processo_aberto(self, area=None, processo=None):
        """Report whether the process is open, optionally in a given unit.

        Returns:
            ``(aberto, mensagem)``.  ``aberto`` is whether the information
            text contains ``'aberto'`` or, when *area* is given, whether a
            line of that text matches *area*.  If the information box cannot
            be read, ``aberto`` is ``None`` and ``mensagem`` is a fixed error
            text.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        try:
            self._enter_visualizacao_frame()
            informacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "divInformacao")))
            mensagem = informacao.text
            aberto = 'aberto' in mensagem
            if area:
                # NOTE(review): area is interpolated into the pattern
                # unescaped, so regex metacharacters in it are active.
                regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
                aberto = search(regex, mensagem) is not None
            self.driver.switch_to.default_content()
        except Exception:
            aberto = None
            mensagem = 'Impossível abrir mensagem do processo'
        return aberto, mensagem

    def get_processo_anexador(self, processo=None):
        """Return the text of the link to the process this one is attached to.

        Returns ``None`` when the information box does not contain
        ``'Processo anexado ao processo'``.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        self._enter_visualizacao_frame()
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        proc_anex = None
        if 'Processo anexado ao processo' in informacao.text:
            processo_anexador = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[@id=\"divInformacao\"]/div/a")))
            proc_anex = processo_anexador.text
        self.driver.switch_to.default_content()
        return proc_anex

    def get_area(self):
        """Return the text of the currently selected unit."""
        self.driver.switch_to.default_content()
        select = Select(self.driver.find_element(By.ID, 'selInfraUnidades'))
        return select.all_selected_options[0].text

    def seleciona_area(self, area):
        """Select the unit named *area*.

        Returns:
            ``True`` if *area* is already selected or was selected now,
            ``False`` if the unit list has no such option.
        """
        self.driver.switch_to.default_content()
        select = Select(self.driver.find_element(By.ID, 'selInfraUnidades'))
        for option in select.all_selected_options:
            if area == option.text:
                return True

        select = Select(self.driver.find_element(By.ID, 'selInfraUnidades'))
        for option in select.options:
            if area == option.text:
                select.select_by_visible_text(area)
                # Wait until the unit selector is present again after the
                # selection.
                Select(
                    WebDriverWait(self.driver, 3).until(
                        EC.presence_of_element_located(
                            (By.ID, 'selInfraUnidades'))))
                return True
        return False

    def clicar_botao(self, botao):
        """Click the first action button whose image title contains *botao*.

        After the click the method waits up to one second for an alert; when
        none shows up it switches back to the top-level document.

        Returns:
            ``True`` if a matching button was clicked, ``False`` otherwise.
        """
        self.driver.switch_to.default_content()
        self._enter_visualizacao_frame()
        arvore = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divArvoreAcoes")))
        botoes = arvore.find_elements(By.XPATH,
                                      '//*[@id=\"divArvoreAcoes\"]/a')
        for b in botoes:
            img = b.find_element(By.XPATH, 'img')
            # A missing title attribute comes back as None.
            titulo = img.get_attribute('title') or ''
            if botao in titulo:
                b.click()
                try:
                    WebDriverWait(self.driver, 1).until(
                        EC.alert_is_present(),
                        'Timed out waiting for PA creation ' +
                        'confirmation popup to appear.')
                except Exception:
                    # No alert: best-effort return to the main document.
                    try:
                        self.driver.switch_to.default_content()
                    except Exception:
                        pass
                return True
        return False

    def fechar_alerta(self):
        """Accept a pending alert and return its text.

        Returns ``None`` when no alert appears within three seconds.
        """
        alerta = None
        try:
            WebDriverWait(self.driver, 3).until(
                EC.alert_is_present(), 'Timed out waiting for PA creation ' +
                'confirmation popup to appear.')
            alert = self.driver.switch_to.alert
            alerta = alert.text
            alert.accept()
            self.driver.switch_to.default_content()
        except TimeoutException:
            pass
        return alerta

    def is_sobrestado(self, area=None, processo=None):
        """Report whether the information text mentions ``'sobrestado'``.

        Returns:
            ``(sobrestado, matched)`` when *area* is given, where ``matched``
            tells whether a line of the text matches *area*; otherwise
            ``(sobrestado, mensagem)`` with the full information text.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        self._enter_visualizacao_frame()
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        # Read the text while still inside the frame; the element cannot be
        # queried reliably after switching back to the top-level document.
        mensagem = informacao.text
        sobrestado = 'sobrestado' in mensagem
        self.driver.switch_to.default_content()
        if area:
            regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
            return sobrestado, search(regex, mensagem) is not None
        return sobrestado, mensagem

    def sobrestar_processo(self, motivo, processo=None):
        """Fill in and save the 'Sobrestar Processo' form with *motivo*.

        Returns:
            ``True`` if the action button was found and the form submitted,
            ``False`` if the button is not available.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        if not self.clicar_botao('Sobrestar Processo'):
            return False
        self._enter_visualizacao_frame()
        self.driver.find_element(By.ID, 'divOptSomenteSobrestar').click()
        motivo_field = self.driver.find_element(By.ID, 'txaMotivo')
        motivo_field.clear()
        motivo_field.send_keys(motivo)
        self.driver.find_element(By.ID, 'sbmSalvar').click()
        self.driver.switch_to.default_content()
        return True

    def remover_sobrestamento(self, processo=None):
        """Click 'Remover Sobrestamento do Processo' and accept its alert.

        Returns:
            ``True`` if the button was clicked, ``False`` otherwise.
        """
        if processo:
            self.go_to(processo)
        if self.clicar_botao('Remover Sobrestamento do Processo'):
            self.fechar_alerta()
            return True
        return False

    def publicar(self,
                 resumo_ementa,
                 data_disponibilizacao,
                 documento=None,
                 dou=False,
                 secao=None,
                 pagina=None):
        """Fill in and save the 'Agendar Publicação' form.

        Args:
            resumo_ementa: Text for the summary field.
            data_disponibilizacao: Date typed into the availability field
                (and into the DOU date field when *dou* is true).
            documento: Document to open first; the current one when omitted.
            dou: Also fill in the DOU vehicle, section, page and date.
            secao: DOU section; defaults to ``'3'``.
            pagina: DOU page; left empty when omitted.

        Returns:
            ``True`` if the form was submitted, ``False`` if the action
            button is not available.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        if not self.clicar_botao('Agendar Publicação'):
            return False

        self._enter_visualizacao_frame()
        resumo_field = self.driver.find_element(By.ID, 'txaResumo')
        resumo_field.clear()
        resumo_field.send_keys(resumo_ementa)
        disponibilizacao = self.driver.find_element(By.ID,
                                                    'txtDisponibilizacao')
        disponibilizacao.clear()
        disponibilizacao.send_keys(data_disponibilizacao)

        if dou:
            select = Select(self.driver.find_element(By.ID, 'selVeiculoIO'))
            select.select_by_visible_text('DOU')
            select = Select(
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "selSecaoIO"))))
            # Resolve the default first: the conditional expression binds
            # looser than '+', which used to yield a malformed selector.
            secao_valor = str(secao) if secao else '3'
            WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "option[value='" + secao_valor + "']")))
            select.select_by_visible_text(secao_valor)
            pagina_field = self.driver.find_element(By.ID, 'txtPaginaIO')
            pagina_field.clear()
            pagina_field.send_keys(pagina if pagina else '')
            data_io = self.driver.find_element(By.ID, 'txtDataIO')
            data_io.clear()
            data_io.send_keys(data_disponibilizacao)

        self.driver.find_element(By.ID, 'btnSalvar').click()
        self.driver.switch_to.default_content()
        return True

    def get_conteudo_documento(self, documento=None):
        """Return the document body's inner HTML after whitespace clean-up.

        Raises:
            Exception: If the document frames or body cannot be read.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            self._enter_visualizacao_frame()
            self._enter_arvore_html_frame()
            documento_conteudo = self.driver.find_element(
                By.XPATH, '/html/body').get_attribute('innerHTML')
            # Drops literal backslash-n sequences (the pattern does not match
            # real newline characters).
            documento_conteudo = sub(r'\\n', '', documento_conteudo)
            # Collapse pairs of whitespace characters into one space.
            documento_conteudo = sub(r'\s\s+?', ' ', documento_conteudo)
            # NOTE(review): as written this replaces a space with a space;
            # the pattern may originally have been an HTML entity - confirm.
            documento_conteudo = sub(r' ', ' ', documento_conteudo)
            # Trim whatever whitespace is left at the ends.
            return documento_conteudo.strip()
        except Exception as err:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_id(self, id, documento=None):
        """Return the text of the document element with the given id.

        Raises:
            Exception: If the frames or the element cannot be found.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # The outer frame is only entered when the last search did not
            # open a separate window.
            if self.__windows_after == self.__windows_before:
                self._enter_visualizacao_frame()
            self._enter_arvore_html_frame()
            return self.driver.find_element(By.ID, id).text
        except Exception as err:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_id(self, id, documento=None):
        """Return the texts of all document elements with the given id.

        Raises:
            Exception: If the frames or the elements cannot be read.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if self.__windows_after == self.__windows_before:
                self._enter_visualizacao_frame()
            self._enter_arvore_html_frame()
            elements = self.driver.find_elements(By.ID, id)
            return [element.text for element in elements]
        except Exception as err:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_xpath(self, xpath, documento=None):
        """Return the text of the document element located by *xpath*.

        Raises:
            Exception: If the frames or the element cannot be found.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if self.__windows_after == self.__windows_before:
                self._enter_visualizacao_frame()
            self._enter_arvore_html_frame()
            return self.driver.find_element(By.XPATH, xpath).text
        except Exception as err:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_xpath(self, xpath, documento=None):
        """Return the texts of all document elements located by *xpath*.

        Raises:
            Exception: If the frames or the elements cannot be read.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if self.__windows_after == self.__windows_before:
                self._enter_visualizacao_frame()
            self._enter_arvore_html_frame()
            elements = self.driver.find_elements(By.XPATH, xpath)
            return [element.text for element in elements]
        except Exception as err:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def close(self, voltar=True):
        """Shut the browser down, first restoring the unit seen at login.

        Args:
            voltar: Switch back to the unit recorded by :meth:`start_driver`
                before closing.  Skipped when no login took place.
        """
        try:
            if voltar and self.__area_inicial is not None:
                self.seleciona_area(self.__area_inicial)
            self.driver.close()
        finally:
            # Quit even if restoring the unit or closing the window failed.
            self.driver.quit()