def chinahpo(hpo):
    """Scrape the ChinaHPO search page for an HPO term and save its HTML.

    Parameters
    ----------
    hpo : str
        Term of the form "HP:<id>"; the numeric part selects the page.

    Side effects: writes the rendered page to ``html2/hp_<id>.html`` and
    appends the term to ``finish.txt``. Uses a random proxy IP and user
    agent to reduce the chance of being blocked.
    """
    # When an IP pool is used, the random wait is skipped (kept from original):
    # s = random.randint(5, 10)
    # print("waiting " + str(s) + " seconds")
    # time.sleep(s)
    ip = randomIP()
    # ip = "socks5://127.0.0.1:1080"
    print("使用IP " + ip)
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    # options.add_argument("disable-gpu")
    options.add_argument("--proxy-server={ip}".format(ip=ip))
    # Reduce automation fingerprinting so the site does not flag the bot.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
    driver = Edge(options=options, executable_path=msedge)
    # Hide navigator.webdriver from anti-bot checks.
    script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    driver.execute_script(script)
    UA = randomUA()
    # UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"
    driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
    print(driver.execute_script("return navigator.userAgent;"))
    hpid = hpo.split(":")[1]
    url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
        hpid=hpid)
    try:
        driver.get(url)
        strtemp = url
        print("网址:", strtemp)
    except Exception:
        print("get page error", hpo)
    time.sleep(2)
    with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
        f.write(str(driver.page_source))
    driver.close()
    # Fix: context manager guarantees the handle is released even if the
    # write fails (original used open()/close() without a finally).
    with open("finish.txt", "a") as fin:
        fin.write(hpo + "\n")
def img_download(url, path, count):
    """Download every image found on *url* into directory *path*.

    Scrolls the page several times to trigger lazy loading, collects image
    URLs via the module-level ``get_all_images`` helper, and downloads each
    with ``download``.

    Parameters
    ----------
    url : str
        Page to scrape.
    path : str
        Destination directory passed through to ``download``.
    count : int
        Running image counter; the updated value is returned.

    Returns
    -------
    int
        Updated *count* after any successful downloads.
    """
    chrome_driver_path = "D:\\programming\\Machine learning\\ml_projects\\google image scraper\\msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=chrome_driver_path, options=option)
    try:
        driver.get(url)
        # time.sleep(10)
        # Scroll in huge steps so lazily-loaded images are fetched.
        for __ in range(10):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(.2)
        imgs = get_all_images(url, driver)
        for img in imgs:  # for each img, download it
            count = download(img, path, count)
    except WebDriverException:
        print("page down")
    finally:
        # Fix: the original leaked the browser process on every call;
        # always release it, including on the exception path.
        driver.quit()
    return count
def wa_login(isHeadless=True):
    '''
    Use to login to Whatsapp Web
    Can omit usage if already logged in once by scanning QR

    Parameters
    ----------
    None

    Returns
    -------
    None
    '''
    options = EdgeOptions()
    options.use_chromium = True #Uses chromium-based edgium, remove to use legacy edge
    # Persist the browser profile under ./Cache so the QR login survives restarts.
    options.add_argument("user-data-dir="+os.getcwd()+"\\Cache")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.49")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # options.add_experimental_option("excludeSwitches", ["enable-automation"])
    # options.add_experimental_option("useAutomationExtension", False)
    options.headless = isHeadless #Headless mode
    # The driver is shared module-wide by the other WhatsApp helpers.
    global driver
    driver = Edge(EdgeChromiumDriverManager().install(),options=options)
    driver.get('https://web.whatsapp.com/')
    # Marker file present -> a session was cached earlier; no QR scan needed.
    if os.path.isfile('./Cache/wa.exists'):
        return
    else:
        pass
    wait_for_load('_1PTz1')
    # Strip overlay elements that would cover the QR code in the screenshot.
    driver.execute_script("""
    var element1 = document.querySelector("._3DgtU");
    var element2 = document.querySelector("._1iKcN");
    if (element1) element1.parentNode.removeChild(element1);
    if (element2) element2.parentNode.removeChild(element2);
    """)
    # Show the QR code to the user so they can scan it with their phone.
    Image.open(BytesIO(driver.find_element_by_class_name('landing-main').screenshot_as_png)).show()
    # Create the marker file so later runs skip the QR flow.
    with open('Cache/wa.exists','w') as file:
        pass
class Web_scraping:
    """Scrape World Surf League (WSL) season/event pages into heat lists.

    All intermediate values are stored as instance attributes and mutated
    per iteration (kept exactly as originally written).
    """

    def __init__(self):
        '''Initialize the application'''
        #As using the standard webdriver was giving warnings and messing up the terminal, I used the code below to show just what I want.
        self.opt = EdgeOptions()
        self.opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.opt.add_argument("--start-maximized")
        self.opt.use_chromium = True
        self.driver = Edge(
            executable_path=
            r"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe",
            options=self.opt)

    def games_link(self):
        '''Create a list with all season event's link and then create another list with all event's link'''
        #Creating list with the all season's link
        self.season_pages_list = []
        for y in range(2008, 2022):
            #Creating the seasons links as str and adding it to a list
            self.season_link = 'https://www.worldsurfleague.com/events/' + str(
                y) + '/mct?all=1'
            self.season_pages_list.append(self.season_link)
        #Creating a list with the all event's link from each season
        self.events_link_list = []
        for link in self.season_pages_list:
            self.driver.get(link)
            #Getting all the events links as selenium format
            self.event_links = self.driver.find_elements_by_xpath(
                '//a[@class="event-schedule-details__event-name"]')
            #Finding the class status completed is needed once it's possible to stop the process on it.
            self.event_status = self.driver.find_elements_by_xpath(
                '//span[@class="event-status event-status--completed"]')
            #Creating event's link list
            # Only completed events are collected (one link per completed status).
            for i in range(0, len(self.event_status)):
                #Getting the links for each event as a str format
                self.link_attribute = self.event_links[i].get_attribute('href')
                self.events_link_list.append(self.link_attribute)
        with open('events.txt', 'w') as f:
            for item in self.events_link_list:
                f.write("%s\n" % item)
        print('FINISHED')

    #Getting data inside which event
    def event_stats(self):
        """Collect per-heat statistics for the first event listed in events.txt.

        Walks the event's rounds from last to first by clicking the
        "previous round" carousel button, appending one flat record per
        heat to ``self.all_heats_lists``.
        """
        #TXT file with all events link to list
        self.events_link = [
            line[0] for line in pd.read_fwf('events.txt',
                                            header=None).values.tolist()
        ]
        #for link in self.events_link:
        self.driver.get(self.events_link[0])
        #list of all heats
        self.all_heats_lists = []
        while True:
            #Gets all the waves scores, athletes, nationalities and heats on the page as list.
            self.waves = self.driver.find_elements_by_xpath(
                '//*[@class="score"]')
            self.athletes = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-name"]')
            self.nationalities = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-country-flag"]')
            self.heat = self.driver.find_elements_by_xpath(
                '//*[@class="new-heat-hd-name"]')
            #Gets the round name
            self.round = self.driver.find_elements_by_xpath(
                '//*[@class="carousel-item is-selected"]')
            if len(self.round) == 0:
                self.round = self.driver.find_elements_by_xpath(
                    '//*[@class="carousel-item last is-selected"]')
            #Gets the number of surfers and heats on the round, such as the avg surfers per heat (must be 2 or 3)
            # Assumes 18 score cells per surfer on the page — TODO confirm.
            self.number_of_surfers = int(len(self.waves) / 18)
            #As the final round only has 1 heat, the find_element_by_class_name gets a 'WebDriver' element and not a list
            self.number_of_heats = len(self.heat)
            self.surfers_per_heat = int(self.number_of_surfers /
                                        self.number_of_heats)
            #there's a count to deduct 1 stage and gets the round name for each round.
            self.count = 0
            #Gets the stats for each heat
            self.heat_data = []
            # NOTE(review): surfer2 indices below start at g*18+18, which for
            # g > 0 overlaps the next heat's surfer1 range (stride looks like
            # it should be 36 per heat) — verify against the live page layout.
            for g in range(0, self.number_of_heats):
                #Page stats
                #Event stats
                self.event_turn = self.driver.find_element_by_class_name(
                    'event-meta-tour-info').text.split()[2][1:]
                self.event_period = self.driver.find_element_by_class_name(
                    'event-schedule__date-range').text
                self.event_name = self.driver.find_element_by_class_name(
                    'event-title').text.split('\n')[0]
                self.event_local = re.split(
                    r'(\d+)',
                    self.driver.find_element_by_class_name(
                        'event-meta-tour-info').text)[2]
                self.avg_wave_score = re.split(
                    r'(\d+\.\d+)',
                    self.driver.find_element_by_class_name(
                        'new-heat-hd-status').text)[1]
                #Heat's id for the database
                self.heat_id = (f'heat{g + 1}' + self.round[0].text +
                                self.event_turn +
                                self.event_period[-4:]).lower()
                #Surfer stats
                self.surfer1 = self.athletes[g * 2].text
                self.surfer1_nat = self.nationalities[g * 2].get_attribute('title')
                self.surfer1_best_w1 = self.waves[g * 18 + (1 - 1)].text
                self.surfer1_best_w2 = self.waves[g * 18 + (2 - 1)].text
                self.surfer1_total = self.waves[g * 18 + (3 - 1)].text
                self.surfer1_w01 = self.waves[g * 18 + (4 - 1)].text
                self.surfer1_w02 = self.waves[g * 18 + (5 - 1)].text
                self.surfer1_w03 = self.waves[g * 18 + (6 - 1)].text
                self.surfer1_w04 = self.waves[g * 18 + (7 - 1)].text
                self.surfer1_w05 = self.waves[g * 18 + (8 - 1)].text
                self.surfer1_w06 = self.waves[g * 18 + (9 - 1)].text
                self.surfer1_w07 = self.waves[g * 18 + (10 - 1)].text
                self.surfer1_w08 = self.waves[g * 18 + (11 - 1)].text
                self.surfer1_w09 = self.waves[g * 18 + (12 - 1)].text
                self.surfer1_w10 = self.waves[g * 18 + (13 - 1)].text
                self.surfer1_w11 = self.waves[g * 18 + (14 - 1)].text
                self.surfer1_w12 = self.waves[g * 18 + (15 - 1)].text
                self.surfer1_w13 = self.waves[g * 18 + (16 - 1)].text
                self.surfer1_w14 = self.waves[g * 18 + (17 - 1)].text
                self.surfer1_w15 = self.waves[g * 18 + (18 - 1)].text
                #Surfer 2 stats
                self.surfer2 = self.athletes[g * 2 + 1].text
                self.surfer2_nat = self.nationalities[g * 2 +
                                                      1].get_attribute('title')
                self.surfer2_best_w1 = self.waves[g * 18 + (19 - 1)].text
                self.surfer2_best_w2 = self.waves[g * 18 + (20 - 1)].text
                self.surfer2_total = self.waves[g * 18 + (21 - 1)].text
                self.surfer2_w01 = self.waves[g * 18 + (22 - 1)].text
                self.surfer2_w02 = self.waves[g * 18 + (23 - 1)].text
                self.surfer2_w03 = self.waves[g * 18 + (24 - 1)].text
                self.surfer2_w04 = self.waves[g * 18 + (25 - 1)].text
                self.surfer2_w05 = self.waves[g * 18 + (26 - 1)].text
                self.surfer2_w06 = self.waves[g * 18 + (27 - 1)].text
                self.surfer2_w07 = self.waves[g * 18 + (28 - 1)].text
                self.surfer2_w08 = self.waves[g * 18 + (29 - 1)].text
                self.surfer2_w09 = self.waves[g * 18 + (30 - 1)].text
                self.surfer2_w10 = self.waves[g * 18 + (31 - 1)].text
                self.surfer2_w11 = self.waves[g * 18 + (32 - 1)].text
                self.surfer2_w12 = self.waves[g * 18 + (33 - 1)].text
                self.surfer2_w13 = self.waves[g * 18 + (34 - 1)].text
                self.surfer2_w14 = self.waves[g * 18 + (35 - 1)].text
                self.surfer2_w15 = self.waves[g * 18 + (36 - 1)].text
                #Inputing all variables into the heat_data list
                self.heat_data.append(self.heat_id)
                self.heat_data.append(self.event_name)
                self.heat_data.append(self.event_local)
                self.heat_data.append(self.event_turn)
                self.heat_data.append(self.event_period)
                self.heat_data.append(self.avg_wave_score)
                self.heat_data.append(self.surfer1)
                self.heat_data.append(self.surfer1_nat)
                self.heat_data.append(self.surfer1_best_w1)
                self.heat_data.append(self.surfer1_best_w2)
                self.heat_data.append(self.surfer1_total)
                self.heat_data.append(self.surfer1_w01)
                self.heat_data.append(self.surfer1_w02)
                self.heat_data.append(self.surfer1_w03)
                self.heat_data.append(self.surfer1_w04)
                self.heat_data.append(self.surfer1_w05)
                self.heat_data.append(self.surfer1_w06)
                self.heat_data.append(self.surfer1_w07)
                self.heat_data.append(self.surfer1_w08)
                self.heat_data.append(self.surfer1_w09)
                self.heat_data.append(self.surfer1_w10)
                self.heat_data.append(self.surfer1_w11)
                self.heat_data.append(self.surfer1_w12)
                self.heat_data.append(self.surfer1_w13)
                self.heat_data.append(self.surfer1_w14)
                self.heat_data.append(self.surfer1_w15)
                self.heat_data.append(self.surfer2)
                self.heat_data.append(self.surfer2_nat)
                self.heat_data.append(self.surfer2_best_w1)
                self.heat_data.append(self.surfer2_best_w2)
                self.heat_data.append(self.surfer2_total)
                self.heat_data.append(self.surfer2_w01)
                self.heat_data.append(self.surfer2_w02)
                self.heat_data.append(self.surfer2_w03)
                self.heat_data.append(self.surfer2_w04)
                self.heat_data.append(self.surfer2_w05)
                self.heat_data.append(self.surfer2_w06)
                self.heat_data.append(self.surfer2_w07)
                self.heat_data.append(self.surfer2_w08)
                self.heat_data.append(self.surfer2_w09)
                self.heat_data.append(self.surfer2_w10)
                self.heat_data.append(self.surfer2_w11)
                self.heat_data.append(self.surfer2_w12)
                self.heat_data.append(self.surfer2_w13)
                self.heat_data.append(self.surfer2_w14)
                self.heat_data.append(self.surfer2_w15)
                # One flat record per heat; copy before clearing the buffer.
                self.all_heats_lists.append(self.heat_data.copy())
                self.heat_data.clear()
            #Click on the previous round botton
            print(self.all_heats_lists)
            try:
                self.prev_round_bt = self.driver.find_element_by_xpath(
                    '//*[@class="flickity-button-icon"]').click()
            except:
                # Button not clickable directly: scroll it into view first.
                self.prev_round_bt = self.driver.find_element_by_xpath(
                    '//*[@class="flickity-button-icon"]')
                self.driver.execute_script("arguments[0].scrollIntoView();",
                                           self.prev_round_bt)
                time.sleep(.5)
                self.prev_round_bt.click()
            time.sleep(2.5)
class Session:
    """Automates a Twitter session in Edge: login, search, scrape, tweet."""

    def __init__(self, username, password, sleep_time=2):
        # Credentials plus the delay (seconds) used between page interactions.
        self.username = username
        self.password = password
        self.sleep_time = sleep_time
        options = EdgeOptions()
        options.use_chromium = True
        self.driver = Edge(options=options)

    def login(self):
        """Log in to Twitter with the stored credentials."""
        self.driver.get("https://www.twitter.com/login")
        sleep(self.sleep_time)
        u_name = self.driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        u_name.send_keys(self.username)
        p_word = self.driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        p_word.send_keys(self.password)
        p_word.send_keys(Keys.RETURN)
        sleep(self.sleep_time)

    def tweet_selection(self, search_str, csv_tit, max_tweets=300):
        """Search for *search_str*, scroll-collect tweets, write them to *csv_tit*.

        Scrolling stops after 3 consecutive attempts yield no new page
        height. NOTE(review): *max_tweets* is currently unused — confirm
        whether a collection cap was intended.
        """
        sleep(self.sleep_time)
        search_input = self.driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.clear()
        search_input.send_keys(search_str)
        search_input.send_keys(Keys.RETURN)
        sleep(self.sleep_time)
        data = []
        tweet_ids = set()  # joined-field strings used for de-duplication
        last_pos = self.driver.execute_script("return window.pageYOffset;")
        scrolling = True
        while scrolling:
            cards = self.driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # Only the last 15 cards are new after a scroll step.
            for card in cards[-15:]:
                tweet = self.get_tweet_data(card)
                if tweet:
                    tweet_id = ''.join(tweet)
                    if tweet_id not in tweet_ids:
                        tweet_ids.add(tweet_id)
                        data.append(tweet)
            scroll_attempt = 0
            while True:
                self.driver.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight);')
                sleep(self.sleep_time)
                curr_pos = self.driver.execute_script(
                    "return window.pageYOffset;")
                if last_pos == curr_pos:
                    # Page did not move: retry up to 3 times, then stop.
                    scroll_attempt += 1
                    if scroll_attempt >= 3:
                        scrolling = False
                        break
                    else:
                        sleep(2 * self.sleep_time)
                else:
                    last_pos = curr_pos
                    break
        with open(csv_tit, 'w', encoding="utf-8") as out:
            csv_out = csv.writer(out)
            csv_out.writerow([
                'user', 'date', 'text', 'quoting', 'reply count',
                'retweet count', 'like count'
            ])
            for row in data:
                csv_out.writerow(row)

    def get_tweet_data(self, card):
        """Extract one tweet card into a 7-tuple, or None if it has no date."""
        user = card.find_element_by_xpath('.//span[contains(text(),"@")]').text
        try:
            date = card.find_element_by_xpath('.//time').get_attribute(
                'datetime')
        except NoSuchElementException:
            # Promoted/ad cards have no <time> element — skip them.
            return
        text = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
        responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
        reply_count = card.find_element_by_xpath(
            './/div[@data-testid="reply"]').text
        retweet_count = card.find_element_by_xpath(
            './/div[@data-testid="retweet"]').text
        like_count = card.find_element_by_xpath(
            './/div[@data-testid="like"]').text
        tweet = (user, date, text, responding, reply_count, retweet_count,
                 like_count)
        return tweet

    def tweet(self, tuit):
        # Requires interacting with Edge (manual step to actually send).
        sleep(self.sleep_time)
        tuit_input = self.driver.find_element_by_xpath(
            '//div[@data-testid="tweetTextarea_0"]')
        tuit_input.clear()
        tuit_input.send_keys(tuit)
class Nvidia():
    """Polls the NVIDIA store API for RTX 3080 stock and auto-checks out.

    Instantiating the class opens an Edge browser, navigates to the
    product page, and immediately starts the polling loop.
    """

    def __init__(self):
        self.api = 'https://api-prod.nvidia.com/direct-sales-shop/DR/products/en_us/USD/5438481700'  #nvidia-api
        self.debug = 'https://jsonplaceholder.typicode.com/todos/2'
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument(
            "--user-data-dir=C:\\Users\\Justin\\AppData\\Local\\Microsoft\\Edge\\User Data\\Profile 1"
        )  #Path to your chrome profile
        # JS snippet: dispatch add-to-cart for product 5438481700, then
        # click the checkout button.
        self.script = '''javascript:store.dispatch({type: actionTypes.ADD_ITEM_TO_CART,id: 5438481700,quantity: 1}); document.getElementsByClassName('nv-button js-checkout cart__checkout-button')[0].click();'''
        # NOTE(review): self.options is built above but never passed to
        # Edge(), so the profile/user-data settings are unused — confirm
        # whether Edge(..., options=self.options) was intended.
        self.driver = Edge(executable_path=os.getcwd() + '/msedgedriver.exe')
        print(
            f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[94m[Browser]\033[00m Browser is open!'
        )
        self.wait = WebDriverWait(self.driver, 10)
        self.test()

    def test(self):
        """Open the product page, then start polling the API."""
        print(
            f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[94m[Browser]\033[00m Going to NVIDIA'
        )
        self.driver.get(
            "https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3080/"
        )
        self.makeRequest()

    def purchaseScript(self):
        """Run the add-to-cart JS and open the quick-buy cart page."""
        try:
            self.driver.execute_script(self.script)
            time.sleep(2)
            self.driver.get(
                'https://store.nvidia.com/store?Action=DisplayPage&Locale=en_US&SiteID=nvidia&id=QuickBuyCartPage'
            )
            print(
                f'{datetime.now().strftime("%H:%M:%S")} \033[92m[Status]\033[00m Bought'
            )
        except Exception:
            # Fix: narrowed the bare except so KeyboardInterrupt/SystemExit
            # still propagate; any script/navigation error is reported.
            print(
                f'{datetime.now().strftime("%H:%M:%S")} \033[92m[Status]\033[00m JavaScript Error'
            )

    def makeRequest(self):
        """Poll the inventory API until the product is in stock, then buy."""
        while True:
            r = requests.get(self.api)  #CHANGE TO API
            if r.status_code != 200:
                # Rate-limited or API error: back off for 30 seconds.
                print(
                    f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[92m[Status]\033[00m \033[96mAPI Cooldown.... waiting 30 seconds\033[00m'
                )
                time.sleep(30)
            else:
                if r.json()["InventoryStatus"][
                        "status"] == 'PRODUCT_INVENTORY_OUT_OF_STOCK':
                    print(
                        f'\033[91m{datetime.now().strftime("%H:%M:%S")}\033[00m \033[92m[Status]\033[00m \033[91mOUT OF STOCK\033[00m'
                    )
                    time.sleep(1.5)
                else:
                    self.purchaseScript()
                    break
class ChannelScrape:
    """
    Scrapes a YouTube channel's ytInitialData JSON via an Edge webdriver.

    Constructors: __init__()
    Methods: toFile(), getUpcomingId(), getLiveId()
    """
    options_edge = EdgeOptions()
    options_edge.use_chromium = True
    options_edge.add_argument('--ignore-certificate-errors')
    options_edge.add_argument('--ignore-ssl-errors')
    options_edge.add_argument('--mute-audio')

    def __init__(self, channelId: str, headless=True, executable_path=None):
        # Fix: honor a caller-supplied driver path. Previously the argument
        # was ignored, and self.path_dir was never set when the PATH search
        # found nothing (AttributeError at driver construction).
        if executable_path is None:
            # Search each PATH entry for the Edge webdriver.
            # Currently untested in linux (PATH separator is Windows-style).
            for p in os.environ['PATH'].split(";"):
                candidate = os.path.join(p, "msedgedriver.exe")
                if os.path.isfile(candidate):
                    self.path_dir = candidate
                    break
        else:
            self.path_dir = executable_path
        # Setup driver
        self.options_edge.headless = headless
        self.driver = Edge(options=self.options_edge,
                           executable_path=self.path_dir)
        # JSON collecting process: the channel page exposes its data as a
        # global JS object; grab it and shut the browser down immediately.
        url = 'https://www.youtube.com/channel/' + channelId
        self.driver.get(url)
        self.jsonData = self.driver.execute_script('return ytInitialData')
        self.driver.quit()

    def toFile(self, output_file: str):
        """
        Output the collected json data to a file

        output_file: Output file name.
                     File extension will be added automatically
        """
        with codecs.open(output_file + '.json', 'w',
                         encoding='utf-8') as jsonFile:
            json.dump(self.jsonData, jsonFile, ensure_ascii=False, indent=1)

    def getUpcomingId(self, dayDelta=14):
        """
        Returns a list of upcoming livestream(s) video ID

        dayDelta: If the upcoming livestream delta is more than the
                  provided argument, the livestream Id will not be
                  added to the return list
        """
        # Personal note:
        # The base for calculating dates is 31-12-1969 (UNIX epoch time)
        # Which is then counted to the used date by seconds
        dateFilter = timedelta(days=dayDelta)
        dateThreshold = datetime.now() + dateFilter
        collectedContents = []
        try:
            content = self.jsonData['contents'][
                'twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer'][
                    'content']['sectionListRenderer']['contents'][1][
                        'itemSectionRenderer']['contents'][0][
                            'shelfRenderer']['content']
        except (KeyError, IndexError, TypeError):
            # Fix: narrowed the bare except to the lookup errors this deep
            # indexing can actually raise.
            print(
                'Index out of range (Most likely channel only have horizontal grid renderer)'
            )
            return collectedContents
        # Only one upcoming livestream
        # This shouldn't need to use for loop assuming that there is always
        # one item in items key. But items is still an array, so just in case
        if "expandedShelfContentsRenderer" in content:
            for item in content['expandedShelfContentsRenderer']['items']:
                liveDateEpoch = int(
                    item['videoRenderer']['upcomingEventData']['startTime'])
                liveDate = datetime.fromtimestamp(mktime(
                    gmtime(liveDateEpoch)))
                if item['videoRenderer']['thumbnailOverlays'][0][
                        'thumbnailOverlayTimeStatusRenderer'][
                            'style'] == "UPCOMING" and liveDate < dateThreshold:
                    collectedContents.append(item['videoRenderer']['videoId'])
        # Multiple upcoming livestreams
        elif "horizontalListRenderer" in content:
            for item in content['horizontalListRenderer']['items']:
                if 'upcomingEventData' in item['gridVideoRenderer']:
                    liveDateEpoch = int(item['gridVideoRenderer']
                                        ['upcomingEventData']['startTime'])
                    liveDate = datetime.fromtimestamp(
                        mktime(gmtime(liveDateEpoch)))
                    if item['gridVideoRenderer']['thumbnailOverlays'][0][
                            'thumbnailOverlayTimeStatusRenderer'][
                                'style'] == "UPCOMING" and liveDate < dateThreshold:
                        collectedContents.append(
                            item['gridVideoRenderer']['videoId'])
        return collectedContents

    def getLiveId(self):
        # Returns a list of the current livestreams video Id, if any
        # It is unlikely that there are multiple livestreams in the same
        # channel, but the possibility is there, therefore it returns a
        # list instead of a single item
        content = self.jsonData['contents']['twoColumnBrowseResultsRenderer'][
            'tabs'][0]['tabRenderer']['content']['sectionListRenderer'][
                'contents'][0]['itemSectionRenderer']['contents'][0]
        collectedContents = []
        if "channelFeaturedContentRenderer" in content:
            for videoItem in content['channelFeaturedContentRenderer'][
                    'items']:
                if videoItem['videoRenderer']['thumbnailOverlays'][0][
                        'thumbnailOverlayTimeStatusRenderer'][
                            'style'] == "LIVE":
                    collectedContents.append(
                        videoItem['videoRenderer']['videoId'])
        return collectedContents
def chooseAccount():
    """Log in to Instagram with the saved account and download post images.

    Reads credentials from data.txt, prompts for a post link and an image
    count, walks the post carousel collecting image URLs, then downloads
    them into ./image and opens the folder.
    """
    with open('data.txt') as json_file:
        data = json.load(json_file)
    userInfo ='account: ' + data['username']
    print(userInfo)
    userName = data['username']
    passWord = data['password']
    print("link:")
    link = input()
    print("number of photos: ")
    amount = input()
    # format text and amount
    amount = int(amount)
    # auto login
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('headless')
    driver = Edge('msedgedriver', options = options)
    driver.get(link)
    time.sleep(2)
    userForm = driver.find_element_by_css_selector("input[name='username']")
    passForm = driver.find_element_by_css_selector("input[name='password']")
    userForm.send_keys(userName)
    passForm.send_keys(passWord)
    driver.find_element_by_css_selector("button[type='submit']").click()
    time.sleep(3)
    # Dismiss the post-login dialog by clicking its button directly in JS.
    driver.execute_script("document.querySelector('.sqdOP.yWX7d.y3zKF').click()")
    # get link image to list
    time.sleep(2)
    # NOTE(review): spriteBtn is only bound when amount > 1; the loop below
    # references it unconditionally — safe only because amount == 1 breaks
    # before reaching the elif. Confirm intended.
    if amount > 1:
        spriteBtn = driver.find_element_by_css_selector(".coreSpriteRightChevron")
    list_link = []
    def get_url1():
        # First slide: take the first visible image.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)
    def get_url2():
        # Subsequent slides: skip the previous slide's image, take the next.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        list_element.pop(0)
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)
    for x in range(0, amount+1):
        if (len(list_link) > 0):
            get_url2()
        else:
            get_url1()
        if len(list_link) == amount:
            break
        elif spriteBtn:
            # Advance the carousel to the next image.
            spriteBtn.click()
        else:
            break
        time.sleep(0.5)
    # check old image folder exist
    if (os.path.isdir("./image")):
        rmtree("./image")
    # create new image folder
    folderPath = os.getcwd()
    folderPath += '\image'
    os.mkdir(folderPath)
    # clear screen
    clear = lambda: os.system('cls')
    clear()
    # Cosmetic progress bar only; does no work.
    for i in tqdm(range(100)):
        pass
    print("\nnumber of photos:", len(list_link))
    pos = 0
    for href in list_link:
        print(pos+1, "DONE")
        imagePathResult = "./image/image_" + str(pos) + ".png"
        try:
            # downloadFile writes ./image/image.png; copy it to its final name.
            downloadFile(href)
            copy("./image/image.png", imagePathResult)
        except:
            print("error at %s" %pos+1)
        pos += 1
    os.remove("./image/image.png")
    resultPath = os.getcwd()
    resultPath = resultPath + '\image'
    # Open the result folder in Explorer (Windows only).
    os.startfile(resultPath)
    driver.close()
    chooseMenu()


# Module-level dispatch: `path` and `menu` are defined elsewhere in the file.
if (os.path.isfile(path)):
    key = 2
else:
    key = 1
menu(key)
class QCourse:
    """Downloads videos from Tencent Classroom (ke.qq.com) via Edge.

    Login cookies are cached in cookies.json so the QR/login flow only
    runs once.
    """

    def __init__(self):
        # Initialize driver options; downloads go to the current directory.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")
        self.login_url = 'https://ke.qq.com/'
        self.driver = Edge(executable_path='msedgedriver.exe',
                           options=self.options)

    def login(self):
        """Interactive login; waits for the user, then caches the cookies."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)
        # Block (up to 300 s) until the login mask disappears, i.e. the
        # user has completed the login dialog.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))
        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        self.driver.close()

    def get_video(self, video_url=None, path=None):
        """Open *video_url* with cached cookies and hand the stream URLs to
        ``download_single``.

        The .ts segment URL and the decryption-key URL are recovered from
        the browser's performance (network) entries once playback starts.
        """
        if not video_url:
            print('请输入视频url!')
        # os.chdir(BASE_DIR)
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        # First navigation establishes the domain so cookies can be added.
        self.driver.get(video_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        self.driver.get(video_url)
        # Wait for the video to start playing (player clock appears and
        # moves past 00:00 / 00:00).
        WebDriverWait(self.driver, 300).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
        WebDriverWait(
            self.driver,
            300).until_not(lambda driver: driver.find_element_by_class_name(
                'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')
        # Scan the network log for the segment (.ts) and key (get_dk) URLs.
        networks = self.driver.execute_script(
            'return window.performance.getEntries()')
        ts_url = key_url = ''
        for network in networks:
            if '.ts?start' in network.get('name'):
                ts_url = network.get('name')
            elif 'get_dk' in network.get('name'):
                key_url = network.get('name')
        title = self.driver.title
        # catalog = self.driver.execute_script('return document.getElementsByClassName("task-item task-info active")'
        #                                      '[0].parentNode.firstElementChild.innerText')
        # os.chdir(os.path.join(os.getcwd(), catalog))
        download_single(ts_url, key_url, title, path)
pingms=round(pingms,2)+1 except: pingms=2 print("-Calculated action delay: "+str(pingms)) # Maximize window for no real reason browser.maximize_window() for i in range(int(nb)): passed=passed+1 try: # Open browser. if firstTime == True: browser.get("https://kahoot.it/") else: browser.execute_script("window.open('');") browser.switch_to.window(browser.window_handles[total]) browser.get("https://kahoot.it/") #if firstTime == False: # wait=WebDriverWait(browser, 3) # alert=wait.until(EC.alert_is_present()) # alert.accept() time.sleep((pingms/2)) # Find game id element and enter game code. search=browser.find_element_by_name("gameId") search.click() search.send_keys(qp) search.send_keys(Keys.RETURN) print("-Joined Game") print("-Entering name option") # Wait for browser to catch up. Edit equation later.
def chinahpo(hpo_queue):
    """Worker: drain *hpo_queue* of HPO terms, scraping each ChinaHPO page.

    For every term this sets up a proxied, geo/timezone-spoofed Edge
    session, saves the rendered page to ``html2/hp_<id>.html``, records
    the term in ``finish.txt``, and logs proxies that produced a
    plausibly-complete page (by file size) to ``ip_check_better.txt``.
    """
    while not hpo_queue.empty():
        hpo = hpo_queue.get()
        # Random wait between requests (skipped only when an IP pool is used).
        s = random.randint(5, 10)
        print(hpo, "等待 " + str(s) + "秒")
        time.sleep(s)
        ip = randomIP()
        # ip = "socks5://127.0.0.1:1080"
        hpo_ip = hpo + "\t" + ip
        print(hpo_ip)
        options = EdgeOptions()
        options.use_chromium = True
        options.add_argument("headless")
        # options.add_argument("disable-gpu")
        options.add_argument("--proxy-server=http://{ip}".format(ip=ip))
        # Reduce automation fingerprinting.
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches",
                                        ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        # Spoof geolocation/timezone to match the proxy's location.
        geo = get_timezone_geolocation(ip)
        print(geo)
        geo_json = {"latitude": geo[1], "longitude": geo[2], "accuracy": 1}
        timezone = {"timezoneId": geo[0]}
        # Disable WebRTC so the real IP cannot leak around the proxy.
        preferences = {
            "webrtc.ip_handling_policy": "disable_non_proxied_udp",
            "webrtc.multiple_routes_enabled": False,
            "webrtc.nonproxied_udp_enabled": False
        }
        options.add_experimental_option("prefs", preferences)
        msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
        driver = Edge(options=options, executable_path=msedge)
        # Hide navigator.webdriver from anti-bot checks.
        script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        driver.execute_script(script)
        UA = UserAgent().random
        # UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"
        driver.execute_cdp_cmd("Network.setUserAgentOverride",
                               {"userAgent": UA})
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", geo_json)
        driver.execute_cdp_cmd("Emulation.setTimezoneOverride", timezone)
        print(driver.execute_script("return navigator.userAgent;"))
        hpid = hpo.split(":")[1]
        url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
            hpid=hpid)
        try:
            driver.get(url)
            strtemp = url
            print("网址:", strtemp)
        except Exception:
            print("get page error", hpo)
        time.sleep(2)
        with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
            f.write(str(driver.page_source))
        driver.close()
        # Fix: use context managers for the bookkeeping files so handles
        # are released even if a write fails (original used open()/close()).
        with open("finish.txt", "a") as fin:
            fin.write(hpo + "\n")
        size = getDocSize("html2/hp_" + hpid + ".html")
        # Pages in this size band look fully rendered -> the proxy worked.
        if 9000 <= size <= 15000:
            with open("ip_check_better.txt", "a") as checkIP:
                checkIP.write(hpo_ip + "\n")
def main():
    """Scrape verified-account tweets for each name in an input file and
    run the sentiment / toxicity / Big-5 report pipeline on them.

    Expects ``sys.argv[4]`` to name a CSV-like file whose rows are
    ``name,profession,nationality,job``.
    """
    args = sys.argv
    # Fix: close the input file deterministically (was left open).
    with open(args[4], "r") as f:
        Lines = f.readlines()
    names, profession, nationality, job = [], [], [], []
    for line in Lines:
        array = line.split(",")
        names.append(array[0])
        profession.append(array[1])
        nationality.append(array[2])
        job.append(array[3].replace("\n", ""))
    for name in names:
        print("Query:", name, ".\nProcessing...")
        user = '******'
        # Restrict the search to verified accounts.
        search_term = f'{name} filter:verified'
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.get('https://www.twitter.com/login')
        driver.maximize_window()
        sleep(2)
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys(user)
        password = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        # SECURITY NOTE(review): plaintext hard-coded password — move to an
        # environment variable or secrets store.
        password.send_keys('donkey123')
        password.send_keys(Keys.RETURN)
        sleep(1)
        search_input = driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.send_keys(search_term)
        search_input.send_keys(Keys.RETURN)
        sleep(1)
        driver.find_element_by_link_text('People').click()
        sleep(3)
        # Open the first matching profile.
        driver.find_element_by_xpath(
            '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
        ).click()
        sleep(3)
        data = []
        tweet_data = []
        start = 0
        end = 500
        # Scroll five screens, harvesting tweet cards each time.
        for i in range(0, 5):
            sleep(1)
            cards = driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            card = cards[i]
            tweet = get_tweet_data(card)
            for card in cards:
                data = get_tweet_data(card)
                if data:
                    tweet_data.append(data)
            driver.execute_script(f'window.scrollTo({start},{end});')
            start += 500
            end += 500
        driver.close()
        # De-duplicate collected tweets before reporting.
        tweets = set(tweet_data)
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')
        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)
        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
def SeleniumPapago(text=None):
    """Translate *text* through the Papago web UI in a headless Edge.

    Source/target languages come from module-level ``srclang``/``tarlang``
    globals. Polls the output box until a non-empty translation appears
    and returns it. Exits the process if the page fails to load in 15 s.
    """
    # Locator pairs (by, value) for the Papago page controls.
    webElements = {
        "input-textbox": ["id", "txtSource"],
        "output-textbox": ["id", "txtTarget"],
        "translate-btn": ["id", "btnTranslate"],
    }

    def find_webelement(element):
        webElement = driver.find_element(element[0], element[1])
        return webElement

    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    options.add_argument("disable-gpu")
    options.add_argument("lang=ko_KR")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    )
    driver = Edge(options=options)
    driver.get(f"https://papago.naver.com/?sk={srclang}&tk={tarlang}")
    ptime = time()
    try:
        wait = WebDriverWait(driver, timeout=15)
        wait.until(
            ec.visibility_of_element_located(webElements["translate-btn"]))
    except Exception:
        # Fix: narrowed the bare except (it also caught KeyboardInterrupt).
        # Waits for 15 seconds before Timeout
        exit("Webpage Timed out!")
    logging.info(f"time {time()-ptime}")
    input_textbox = find_webelement(webElements["input-textbox"])
    output_textbox = find_webelement(webElements["output-textbox"])
    translate_button = find_webelement(webElements["translate-btn"])
    source = text
    # Creates a JavaScript to input the batch to the WebElement input_textbox
    script = "var ele = " + repr(
        source) + ";" + "\n document.getElementById('txtSource').value=ele;"
    driver.execute_script(script)
    # A keystroke nudges Papago's change-detection after the JS injection.
    input_textbox.send_keys(" ")
    sleep(0.1)
    translate_button.click()
    translated = ""
    wait = 0
    # Poll until the output box contains a real (non-blank) translation.
    while True:
        if wait == 4:
            wait = 0
        sleep(0.05)
        translated = output_textbox.text
        logging.info(f"waiting{wait*'.'} ,{source}")
        wait += 1
        if not (translated == "" or translated is None or translated == " "):
            logging.info(translated)
            break
    return translated
#img_str = base64.b64encode(buffered.getvalue()) images_women.append(src) driver.get(url_names) txt_box = driver.find_element_by_xpath('//*[@id="main"]/div/form/input[3]') txt_box.clear() txt_box.send_keys("95") # men select = Select(driver.find_element_by_xpath('//*[@id="gender"]')) select.select_by_visible_text('male') time.sleep(2) driver.find_element_by_xpath('//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]').click() time.sleep(3) driver.execute_script("window.scrollTo(0, 1080)") driver.find_element_by_xpath('//*[@id="main"]/div/form/input[4]').click() time.sleep(5) names = driver.find_elements_by_class_name('name_heading') for name in names: names_men.append(name.text) # women select = Select(driver.find_element_by_xpath('//*[@id="gender"]')) select.select_by_visible_text('female') time.sleep(2) actions = ActionChains(driver) element = driver.find_element_by_xpath('//*[@id="main"]/div/form/input[4]') actions.move_to_element(element).click().perform() time.sleep(5) names = driver.find_elements_by_class_name('name_heading')
print("Getting atlas")
# Open the Opportunity Atlas and give the heavy map UI time to load.
driver.get("https://opportunityatlas.org/")
time.sleep(10)

# Dismiss the landing overlay via its "Get Started" button.
start_button = driver.find_element_by_xpath(
    "/html/body/div[3]/div[2]/div[1]/p[5]/button")
start_button.click()

# The intro dialog can still intercept clicks, so remove it from the DOM.
intro_dialog = driver.find_element_by_id('introductionDialog')
driver.execute_script(
    """
    var element = arguments[0];
    element.parentNode.removeChild(element);
    """, intro_dialog)

# Search for the census tract and screenshot the resulting map view.
tract_search = driver.find_element_by_xpath(
    '/html/body/div[2]/div[1]/div[2]/div[3]/div[2]/input')
tract_search.clear()
tract_search.send_keys(tract_id)
time.sleep(2)
tract_search.send_keys(Keys.ENTER)
time.sleep(10)
driver.save_screenshot('test.png')
#with io.BytesIO(driver.get_screenshot_as_png()) as f:
#    f.write('./test.png')
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import time
import random

# Candidate search phrases; one is picked at random per run.
word_list = ["stack overflow how to capture traffic",
             "java display thread",
             "p2p network"]

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.get("https://www.google.com/")
driver.maximize_window()
time.sleep(1)

# Click through the consent element, then type a random query.
inputElement = driver.find_element_by_id("zV9nZe")
inputElement.click()
inputElement = driver.find_element_by_xpath("/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input")
inputElement.send_keys(random.choice(word_list), Keys.ENTER)

# Open the first organic result, then scroll to the bottom of it.
element = driver.find_element_by_xpath("/html/body/div[7]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div[1]/div/div[1]/a")
time.sleep(1)
driver.get(element.get_attribute('href'))
time.sleep(1)
# FIX: the script's return value was bound to an unused `lenOfPage`
# variable; the call is kept for its scrolling side effect only.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
time.sleep(4)
driver.close()
import random

# Headless Edge configured to mask common Selenium automation fingerprints.
options = EdgeOptions()
options.use_chromium = True
options.add_argument("headless")
# options.add_argument("disable-gpu")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
driver = Edge(options=options, executable_path=msedge)
# Hide navigator.webdriver so the site does not detect automation.
script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
driver.execute_script(script)
url = "https://www.deciphergenomics.org/genes"
driver.get(url)
print("网址:", url)
# Wait for the page to load (the gene table renders slowly).
time.sleep(40)
# Locate the page-size dropdown and select 100 rows per page.
driver.find_element_by_xpath(
    '//*[@id="content"]/div/div/div[2]/div/div/div[2]/div/div[1]/div/label/select/option[@value="100"]'
).click()
time.sleep(10)
# Save the first page.
def download(url):
    """Download a doc88 (道客巴巴) document page-by-page as PNG images.

    Opens *url* in Edge, expands the full document, waits for each page
    canvas to finish rendering, saves it under ``./temp/<title>/<n>.png``,
    and finally converts the folder into ``output/<title>.pdf`` via
    ``conpdf``.
    """
    options = EdgeOptions()
    options.use_chromium = True
    # option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    options.add_argument('log-level=3')
    driver = Edge(options=options)
    # driver = webdriver.Chrome(
    #     executable_path='.//chromedriver', chrome_options=option)
    title = "output"
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        title = driver.title
    except:
        # Deliberate best-effort: a slow page still renders enough to scrape.
        print("Timeout - start download anyway.")
    print(f'道客巴巴: 《{title}》')
    time.sleep(5)
    try:
        # Expand the whole document ("continue reading" button).
        elem_cont_button = driver.find_element_by_id("continueButton")
        driver.execute_script(
            "arguments[0].scrollIntoView(true);", elem_cont_button)
        actions = ActionChains(driver)
        actions.move_to_element(elem_cont_button).perform()
        time.sleep(0.5)
        elem_cont_button.click()
    except NoSuchElementException:
        # Short documents have no "expand" button.
        pass
    # Read the total page count from the reader toolbar text.
    num_of_pages = driver.find_element_by_id('readshop').find_element_by_class_name(
        'mainpart').find_element_by_class_name('shop3').find_element_by_class_name('text').get_attribute('innerHTML')
    num_of_pages = int(num_of_pages.split(' ')[-1])
    # Zoom in 5 steps so the canvases render at higher resolution.
    for i in range(5):
        driver.find_element_by_id('zoomInButton').click()
        time.sleep(0.5)
    if os.path.exists(f'./temp/{title}'):
        shutil.rmtree(f'./temp/{title}')
    os.makedirs(f'./temp/{title}')
    for pages in trange(num_of_pages):
        time.sleep(0.5)
        canvas_id = "page_" + str(pages + 1)
        pagepb_id = "pagepb_" + str(pages + 1)
        # Scroll the page canvas into view so the site lazily renders it.
        element = driver.find_element_by_id(canvas_id)
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        actions = ActionChains(driver)
        actions.move_to_element(element).perform()
        time.sleep(0.5)
        # Check loading status: the progress element empties when rendering
        # is complete.
        while(len(driver.find_element_by_id(pagepb_id).get_attribute('innerHTML')) != 0):
            time.sleep(1)
            # print(driver.find_element_by_id(
            #     pagepb_id).get_attribute('innerHTML'))
        # Export the canvas as a base64 data URL; the first 22 characters
        # are the "data:image/png;base64," header, which is stripped.
        js_cmd = "var canvas = document.getElementById('{}');".format(canvas_id) + \
            "return canvas.toDataURL();"
        img_data = driver.execute_script(js_cmd)
        img_data = (img_data[22:]).encode()
        with open(f"./temp/{title}/{pages}.png", "wb") as fh:
            fh.write(base64.decodebytes(img_data))
    driver.quit()
    print('下载完毕,正在转码')
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')
def scrape(secure=False):
    """Log into Twitter and scrape tweets for a hard-coded Urdu query.

    Prompts for a search text (displayed only), signs in, runs the query,
    then scrolls the "Latest" timeline collecting unique tweets until
    scrolling no longer loads new content.

    :param secure: when True, performs a second login pass (for accounts
        that trigger re-verification).
    :return: list of tweets as returned by ``scrap_tweets``.
    """
    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)
    query = input("▁ ▂ ▄ ▅ ▆ ▇ █ 𝐄𝐧𝐭𝐞𝐫 𝐭𝐡𝐞 𝐓𝐞𝐱𝐭 𝐭𝐨 𝐬𝐞𝐚𝐫𝐜𝐡 █ ▇ ▆ ▅ ▄ ▂ ▁\n\n ")
    print("\n𝘚𝘵𝘢𝘳𝘵𝘦𝘥 𝘚𝘤𝘳𝘢𝘱𝘪𝘯𝘨 ↦↦↦↦↦↦↦↦↦↦")
    print("\nPlease Wait ............\n")
    driver.get("https://www.twitter.com/login")
    driver.maximize_window()
    # NOTE(review): credentials below are hard-coded placeholders; they
    # should come from a secure store, not source code.
    username = driver.find_element_by_xpath(
        '//input[@name="session[username_or_email]"]')
    username.send_keys("*****@*****.**")
    #password=getpass()
    userpas = driver.find_element_by_xpath(
        '//input[@name="session[password]"]')
    userpas.send_keys('-----')
    userpas.send_keys(Keys.RETURN)
    sleep(2)
    if secure:
        # Second login pass for accounts asked to re-verify.
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys("031-----")
        userpas = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        userpas.send_keys('----')
        userpas.send_keys(Keys.RETURN)
        sleep(2)
    search = driver.find_element_by_xpath(
        '//input[@aria-label="Search query"]')
    search.send_keys('"پاک فوج" lang:ur -filter:links filter:replies')
    search.send_keys(Keys.RETURN)
    sleep(1.5)
    driver.find_element_by_link_text("Latest").click()
    data = []
    tweet_ids = set()  # joined-text fingerprints used to de-duplicate
    last_position = driver.execute_script("return window.pageYOffset;")
    scrolling = True
    while scrolling:
        # Only the last 15 cards can be new; earlier ones were seen before.
        posts = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
        for post in posts[-15:]:
            tweet = scrap_tweets(post)
            if tweet:
                tweet_id = "".join(tweet)
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    data.append(tweet)
        scroll_attempt = 0
        while True:
            driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight);")
            sleep(1)
            curr_position = driver.execute_script("return window.pageYOffset;")
            if last_position == curr_position:
                # Page did not move: retry up to 3 times, then stop scraping.
                scroll_attempt += 1
                if scroll_attempt >= 3:
                    scrolling = False
                    break
                else:
                    sleep(2)
            else:
                last_position = curr_position
                break
    return data
from selenium.webdriver.common.keys import Keys
import time
import random

options = EdgeOptions()
options.use_chromium = True
# Candidate Wikipedia articles; one is visited at random per run.
word_list = [
    "https://pt.wikipedia.org/wiki/Border_Gateway_Protocol",
    "https://pt.wikipedia.org/wiki/Multi_Protocol_Label_Switching",
    "https://pt.wikipedia.org/wiki/Open_Shortest_Path_First"
]
driver = Edge(options=options)
driver.get("https://pt.wikipedia.org/")
driver.maximize_window()
time.sleep(5)
driver.get(random.choice(word_list))
time.sleep(4)
# Scroll through the article in 300px steps with a pause between each
# (replaces five copy-pasted scrollTo/sleep pairs with a loop that emits
# the identical JavaScript strings).
for offset in range(0, 1500, 300):
    driver.execute_script(f"window.scrollTo({offset}, {offset + 300})")
    time.sleep(4)
driver.close()
def main():
    """Scrape Google Images for a query read from stdin and download hits.

    Reads the search text and requested image count from stdin, scrolls
    the results page to force lazy-loaded thumbnails, extracts image URLs
    from the raw page source, downloads them, and hands leftover links to
    ``img_scp.img_download``.
    """
    searchtext = input()
    num_requested = int(input())
    # number_of_scrolls * 400 images will be opened in the browser
    number_of_scrolls = num_requested / 400 + 1
    if not os.path.exists(download_path + searchtext.replace(" ", "_")):
        os.makedirs(download_path + searchtext.replace(" ", "_"))
    url = "https://www.google.co.in/search?q=" + searchtext + "&source=lnms&tbm=isch"
    chrome_driver_path = "msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=chrome_driver_path, options=option)
    driver.get(url)
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    extensions = {"jpg", "jpeg", "png", "gif"}
    img_count = 0
    downloaded_img_count = 0
    for _ in range(int(number_of_scrolls)):
        # Scroll hard to make lazy-loaded thumbnails appear.
        for __ in range(15):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath(
                "//input[@value='Show more results']").click()
        except Exception as e:
            print("Less images found: {}".format(e))
            break
    # Crude extraction: split the HTML on quotes, keep URL-looking chunks.
    html = driver.page_source.split('"')
    imges = []
    links = []
    for i in html:
        if i.startswith('https:') and ('gstatic' not in i) and ('google' not in i):
            links.append(i.split('"')[0])
    for i in html:
        if i.startswith('http') and 'usqp=CAU' in i.split('.')[-1]:
            imges.append(i.split('"')[0])
    for i in html:
        if i.startswith('http') and i.split('"')[0].split('.')[-1] in extensions:
            imges.append(i.split('"')[0])
    links = list(set(links))
    imges = list(set(imges))
    print(imges)
    links_left = Diff(links, imges)
    # Removing duplicates while preserving order.
    urls_new = []
    [urls_new.append(x) for x in links_left if x not in urls_new]
    # BUG FIX: the handle was previously leaked ("f.close" without call
    # parentheses never ran); "with" closes it deterministically.
    with open("page_source.txt", "w", encoding='utf8') as file1:
        file1.writelines(urls_new)
    print("Total images: {}\n".format(len(imges)))
    for img in imges:
        img_count += 1
        print("Downloading image {}:{}".format(img_count, img))
        try:
            req = Request(img, headers=headers)
            raw_img = urlopen(req).read()
            # BUG FIX: same leak as above; also removed the no-op bare
            # `print` reference that sat in the old `finally:` clause.
            with open(download_path + searchtext.replace(" ", "_") + "/" +
                      str(downloaded_img_count) + "." + "jpeg", "wb") as f:
                f.write(raw_img)
            downloaded_img_count += 1
        except Exception as e:
            print("Download failed: {}".format(e))
        if downloaded_img_count >= num_requested:
            break
    print("Total downloaded: {}/{}".format(downloaded_img_count, img_count))
    print("Total images: {}\n".format(len(urls_new)))
    for url in urls_new:
        img_count = img_scp.img_download(
            url, download_path + searchtext.replace(" ", "_") + "/", img_count)
    driver.quit()
class QCourse:
    """Downloader for Tencent Classroom (ke.qq.com) course videos.

    Drives Edge via Selenium to log in, mine stream/decryption-key URLs
    from the browser's performance entries, and hand them to
    ``download_single`` (or ``download_zip_doc`` for attachments).
    """

    def __init__(self):
        # Initialize driver options: download dir, quiet logging, muted audio.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")
        self.login_url = 'https://ke.qq.com/'
        # Passing `options` errors out on Mac, so it is dropped here.
        # On Windows, use the msedgedriver.exe path below instead
        # (comment the next line, uncomment the one after).
        self.driver = Edge(executable_path=os.path.join(
            BASE_DIR, 'msedgedriver'), capabilities={})
        # self.driver = Edge(executable_path='msedgedriver.exe', options=self.options)
        # self.driver = Edge(executable_path=os.path.join(BASE_DIR, 'msedgedriver'), capabilities=desired_cap, options=self.options)

    def login(self):
        """Open the QQ login mask, wait for interactive login, save cookies."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)
        # Block (up to 300s) until the login mask disappears, i.e. the user
        # finished logging in by hand.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))
        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def _get_video(self, video_url=None, path=None, index=None):
        """Fetch one lesson: capture its .ts/key URLs or download attachments."""
        if not video_url:
            print('请输入视频url!')
        # A single get() fails to navigate (the page appears to call
        # preventDefault), so the URL is requested twice.
        self.driver.get(video_url)
        self.driver.get(video_url)
        try:
            # Wait for the video to start playing (time display appears and
            # moves past 00:00 / 00:00).
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
            WebDriverWait(
                self.driver, 60).until_not(lambda driver: driver.find_element_by_class_name(
                    'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')
            title = self.driver.title
            if index is not None:
                title = "{:02}_{}".format(index, title)
            # Mine the performance entries for the media-segment (.ts) and
            # decryption-key (get_dk) request URLs.
            networks = self.driver.execute_script(
                'return window.performance.getEntries()')
            ts_url = key_url = ''
            for network in networks:
                if '.ts?start' in network.get('name'):
                    ts_url = network.get('name')
                elif 'get_dk' in network.get('name'):
                    key_url = network.get('name')
            download_single(ts_url, key_url, title, path)
        except TimeoutException:
            # On timeout the lesson may be downloadable material rather than
            # a video: look for a download button and fetch it if present.
            title = self.driver.title
            try:
                down_btn = self.driver.find_element_by_class_name(
                    'download-btn')
                if down_btn.text == '下载资料':
                    url = down_btn.get_attribute('href')
                    download_zip_doc(url, title, path)
            except Exception:
                print('没有找到视频,也没有找到可下载的文件,可能是还未开课')

    def get_video(self, video_url=None, path=None, index=None):
        """Accept a single URL or a list of URLs and download each."""
        if isinstance(video_url, list):
            for url in video_url:
                if url:
                    self._get_video(url, path, index)
        else:
            self._get_video(video_url, path, index)

    def load_cookies(self):
        """Load saved cookies into the driver, logging in first if needed."""
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        self.driver.get(self.login_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        # Merge in extra cookies supplied by the project's utils helper.
        for cookie in utils.get_cookies_dic_list():
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': False,
                'name': cookie[0],
                'path': '/',
                'secure': False,
                'value': cookie[1]
            })
class TwitterBot():
    """Selenium-driven Twitter bot: login, search, and tweet scraping."""

    def __init__(self):
        self.driver = Edge()
        self.driver.maximize_window()
        self.driver.get('https://twitter.com')
        self.driver.implicitly_wait(3)

    def goToTwitter(self):
        """Navigate (back) to the Twitter home page."""
        self.driver.get('https://twitter.com')

    def login(self):
        """Log in using the module-level ``username``/``password`` globals."""
        self.driver.find_element_by_xpath("//a[@href='/login']").click()
        # I used sleep because before this time there is another instance of
        # an element named like below. It is crucial to get the right element
        # in order to interact with it.
        sleep(1)
        self.driver.find_element_by_xpath(
            "//input[@name='session[username_or_email]']").send_keys(username)
        self.driver.find_element_by_xpath(
            "//input[@name='session[password]']").send_keys(password)
        self.driver.find_element_by_xpath(
            "//div[@data-testid='LoginForm_Login_Button']").click()

    def basicSearch(self, topic):
        """Type *topic* into the search box and submit it."""
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(topic)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def advancedSearch(self, exact, any, none, hashtags, dateFrom, dateTo):
        """Build and submit a Twitter advanced-search query.

        Each argument may be None to skip its clause. (The ``any``/``none``
        parameter names shadow builtins but are kept for caller
        compatibility.)
        """
        finalSearch = ''
        # This is to accommodate for different search types that a user
        # might want. FIX: compare against None with `is not` (identity)
        # rather than `!=`.
        if exact is not None:
            finalSearch += '"' + exact + '" '
        if any is not None:
            finalSearch += '(' + any + ') '
        if none is not None:
            finalSearch += '-' + none + ' '
        if hashtags is not None:
            finalSearch += '(#' + hashtags + ') '
        if dateTo is not None:
            finalSearch += 'until:' + dateTo + ' '
        if dateFrom is not None:
            finalSearch += 'since:' + dateFrom + ' '
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(
                finalSearch)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def scrapeTweets(self, desiredNum):
        """Scroll the timeline collecting up to *desiredNum* cleaned lines.

        Writes punctuation-stripped lines to ``tweets.csv`` and returns the
        concatenated raw lines.
        """
        allLines = ''
        oldDataLines = []
        dataLines = ['init']
        tweetsFile = open('tweets.csv', 'w')
        # I included this array to help clean data later.
        dirtyArray = [
            'Quote Tweet', 'Promoted', 'Show this thread', '', '\n', ' '
        ]
        numDataLines = 0
        # Stop once enough lines are collected or a scroll yields no change.
        while numDataLines < desiredNum and oldDataLines != dataLines:
            oldDataLines = dataLines
            sleep(1)
            # All these are different types of data that I do not want to
            # pick up.
            dirtyData = self.driver.find_elements_by_xpath(
                "//div[@class='css-1dbjc4n r-1d09ksm r-18u37iz r-1wbh5a2']")
            dirtyData2 = self.driver.find_elements_by_xpath(
                "//div[@class = 'css-1dbjc4n r-18u37iz r-1wtj0ep r-156q2ks r-1mdbhws']"
            )
            dirtyData3 = self.driver.find_elements_by_xpath(
                "//div[contains(text(),'Replying to')]")
            dirtyData4 = self.driver.find_elements_by_xpath(
                "//div[@role = 'blockquote']")
            # Adding all the dirty data into one array.
            for dirt in dirtyData2:
                dirtyData.append(dirt)
            for dirt in dirtyData3:
                dirtyData.append(dirt)
            for dirt in dirtyData4:
                dirtyData.append(dirt)
            # The data is stored as strings with many lines, so split each
            # string by line into an array where each index is one line.
            dirtyLines = []
            for dirt in dirtyData:
                dirt = dirt.text
                chunks = dirt.split('\n')
                for chunk in chunks:
                    dirtyLines.append(chunk)
            # This includes dirty data that will be weeded out later.
            data = self.driver.find_elements_by_xpath(
                "//div[@data-testid='tweet']")
            # Same thing I did with dirtyLines.
            dataLines = []
            for datapoint in data:
                datapoint = datapoint.text
                chunks = datapoint.split('\n')
                for chunk in chunks:
                    dataLines.append(chunk)
            # I check oldDataLines as well to avoid redundancy.
            for line in dataLines:
                if line not in dirtyLines and line not in oldDataLines and line not in dirtyArray:
                    if numDataLines >= desiredNum:
                        break
                    try:
                        noPunctuationLine = re.sub(r'[^\w\s]', '', line)
                        tweetsFile.write(noPunctuationLine)
                        tweetsFile.write("\n")
                        allLines += line
                        numDataLines += 1
                    except Exception:
                        print('This data point not encodable.')
            height = self.driver.execute_script(
                "return document.documentElement.scrollHeight")
            self.driver.execute_script("window.scrollTo(0, " + str(height) + ");")
        tweetsFile.close()
        return allLines
password.send_keys(my_password) password.send_keys(Keys.RETURN) sleep(1) # find search input and search for term search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]') search_input.send_keys(search_term) search_input.send_keys(Keys.RETURN) sleep(1) # navigate to historical 'latest' tab driver.find_element_by_link_text('Latest').click() # get all tweets on the page tweet_ids = set() last_position = driver.execute_script("return window.pageYOffset;") scrolling = True while scrolling: page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]') for card in page_cards[-15:]: if card is not None: tweet = get_tweet_data(card) if tweet is not None: tweetL = list(tweet) tweetL.append(x) tweetL.append(j) tweet = tuple(tweetL) if tweet: if tweet[2]<cutoff_date: scrolling=False else:
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import time

# Launch Edge (Chromium) and open YouTube.
options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.maximize_window()
driver.get("https://www.youtube.com/")
time.sleep(1)

# Scroll to the bottom, then tab four times to reach and activate the
# consent/confirm button from the keyboard.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_body = driver.find_element_by_tag_name('body')
page_body.send_keys(Keys.TAB * 4, Keys.ENTER)
time.sleep(1)

# Open the search results for the 2-hour song and click the first video.
driver.get("https://www.youtube.com/results?search_query=Drive+Drive+Drive+song+(Impractical+Jokers)+-+2+HOUR+VERSION")
time.sleep(5)
first_video = driver.find_element_by_xpath("/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer[1]")
first_video.click()

# Let the video play for ~6.5 minutes before closing the browser.
time.sleep(400)
driver.close()