Beispiel #1
0
 
 # find search input and search for term
 search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
 search_input.send_keys(search_term)
 search_input.send_keys(Keys.RETURN)
 sleep(1)
 
 # navigate to historical 'latest' tab
 driver.find_element_by_link_text('Latest').click()
 
 # get all tweets on the page
 tweet_ids = set()
 last_position = driver.execute_script("return window.pageYOffset;")
 scrolling = True
 while scrolling:
     page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
     for card in page_cards[-15:]:
         if card is not None:
             tweet = get_tweet_data(card)
             if tweet is not None:
                 tweetL = list(tweet)
                 tweetL.append(x)
                 tweetL.append(j)
                 tweet = tuple(tweetL)
                 if tweet:
                     if tweet[2]<cutoff_date:
                         scrolling=False
                     else:
                         idtweet = list(tweet)
                         idtweet[2]=str(idtweet[2])
                         tweet_id = ''.join(idtweet)
Beispiel #2
0
class ReservationEngine:
    """Drive an Edge session that logs in to ikonpass.com and books a day.

    Status flags updated by :meth:`find_date` and :meth:`reserve`:

    * ``available`` -- the selected day is neither "past" nor "unavailable".
    * ``booked`` -- the selected day is already confirmed, or was just booked.
    * ``reservations_left`` -- no 'Reservation Limit Reached' text was found.
    """

    def __init__(self, email, password, headless=True):
        """Store the credentials, reset the status flags and start Edge.

        Args:
            email: Text typed into the ``input#email`` field at login.
            password: Text typed into the ``input#sign-in-password`` field.
            headless: When true, pass the ``headless`` argument to Edge.
        """
        self.email = email
        self.password = password
        self.available = False
        self.booked = False
        self.reservations_left = False
        options = EdgeOptions()
        options.add_argument("--log-level=3")
        options.use_chromium = True
        if headless:
            options.add_argument("headless")
        self.driver = Edge(options=options)
        print("Starting web driver...")

    def remove_overlay(self):
        """Click every ``a.cc-btn`` element until none has a non-zero height.

        NOTE: the loop has no upper bound; it keeps retrying for as long as
        at least one matching element still reports a non-zero height.
        """
        buttons = self.driver.find_elements_by_css_selector("a.cc-btn")
        while any(button.size["height"] != 0 for button in buttons):
            for button in buttons:
                try:
                    button.click()
                except Exception:
                    # Best effort: a button that cannot be clicked right now
                    # is retried on the next pass. ``Exception`` (not a bare
                    # ``except``) keeps Ctrl-C able to break out of the loop.
                    pass
            buttons = self.driver.find_elements_by_css_selector("a.cc-btn")

    def login(self):
        """Open the login page, submit the credentials and wait for the picker.

        Waits up to 10 seconds for ``input.react-autosuggest__input`` to be
        present after submitting the form.
        """
        print("Logging in")
        self.driver.get(
            "https://account.ikonpass.com/en/login?redirect_uri=/en/myaccount/add-reservations/"
        )
        self.remove_overlay()
        email_box = self.driver.find_element_by_css_selector("input#email")
        email_box.send_keys(self.email)
        password_box = self.driver.find_element_by_css_selector(
            "input#sign-in-password")
        password_box.send_keys(self.password)
        submit = self.driver.find_element_by_css_selector("button.submit")
        submit.click()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        print("Logged in")

    def refresh(self):
        """Reload the current page."""
        self.driver.refresh()

    def find_date(self, date, resort):
        """Pick *resort*, open its calendar and select *date*.

        Updates ``self.available``, ``self.booked`` and
        ``self.reservations_left`` from the selected day's CSS classes and
        from the presence of a 'Reservation Limit Reached' message.

        Args:
            date: Object providing ``month`` and ``strftime`` (for example a
                ``datetime.date``).
            resort: Text typed into the resort search box; the first
                suggestion is chosen.
        """
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        self.remove_overlay()

        # Select the resort: type its name and take the first suggestion.
        search = self.driver.find_element_by_css_selector(
            "input.react-autosuggest__input")
        search.send_keys(resort)
        button = self.driver.find_element_by_css_selector(
            "li#react-autowhatever-resort-picker-section-1-item-0")
        button.click()
        button = self.driver.find_element_by_xpath(
            "//span[contains(text(), 'Continue')]")
        button.click()

        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.DayPicker-wrapper')))
        self.remove_overlay()

        # Select the date: page forward until the month name is shown.
        # NOTE: XPath expressions starting with '//' are evaluated against
        # the whole document, even when called on ``datepicker``.
        datepicker = self.driver.find_element_by_css_selector(
            "div.DayPicker-wrapper")
        month_text = calendar.month_name[date.month]
        month_selected = False
        while not month_selected:
            month = datepicker.find_elements_by_xpath(
                "//span[contains(text(), " + "'" + month_text + "')]")
            if len(month) > 0:
                month_selected = True
            else:
                button = datepicker.find_element_by_class_name(
                    "icon-chevron-right")
                button.click()

        day = datepicker.find_element_by_xpath("//div[@aria-label='" +
                                               date.strftime("%a %b %d %Y") +
                                               "']")
        day.click()
        day_classes = day.get_attribute(name="class")

        self.available = "past" not in day_classes and "unavailable" not in day_classes
        self.booked = "confirmed" in day_classes
        div = self.driver.find_elements_by_xpath(
            "//div[contains(text(), 'Reservation Limit Reached')]")
        self.reservations_left = len(div) == 0
        print("Date Selected: " + date.strftime("%m/%d/%Y"))

    def reserve(self):
        """Confirm the reservation when the selected day can be booked.

        Nothing is clicked unless the day is available, not already booked
        and reservations are left.

        Returns:
            The value of ``self.booked`` after the attempt.
        """
        if self.available and not self.booked and self.reservations_left:
            self.remove_overlay()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Save')]")
            button.click()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Continue to Confirm')]")
            button.click()

            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@type='checkbox']")))
            button = self.driver.find_element_by_xpath(
                "//input[@type='checkbox']")
            button.click()
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     "//span[contains(text(), 'Confirm Reservations')]")))
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Confirm Reservations')]")
            button.click()
            self.booked = True
            print("Booked")
        return self.booked

    def log_results(self, log_file_name):
        """Append one timestamped line with the three status flags.

        Args:
            log_file_name: Path of the log file, opened in append mode.
        """
        timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        with open(log_file_name, "a") as f:
            f.write("%s: Available - %r, Booked - %r, Reservations Left- %r\n" %
                    (timestamp, self.available, self.booked,
                     self.reservations_left))

    def close_driver(self):
        """Shut the browser session down.

        ``quit()`` ends the whole WebDriver session; ``close()`` would only
        close the current window and can leave the driver process running.
        """
        self.driver.quit()
Beispiel #3
0
                edge_options.add_argument(
                    'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
                     AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63"'
                )
                #edge_options.binary_location = executable_path
                driver = Edge(executable_path=executable_path,
                              options=edge_options)

                driver.maximize_window()
                driver.get(url)

                #sleep(1)

                # Automatically enter the student ID and password
                driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/div[3]/div[1]/input')[0].send_keys(
                        your_id)
                driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/div[3]/div[2]/input')[0].send_keys(
                        your_password)
                # driver.find_elements_by_xpath('//*[@id="app"]/div/div[3]/button')[0].click()

                for i in range(10):
                    try:
                        # Fetch the captcha and process it
                        print('第%i次尝试' % (i + 1))
                        driver.find_elements_by_xpath(
                            '//*[@id="app"]/div/div[3]/div[4]')[0].click()

                        sleep(0.5)
# Start a Chromium-based Edge session that loads the unpacked
# "markdown-clipper" extension from the working directory and uses
# "Markdown Output" (also under the working directory) for downloads.
options = EdgeOptions()
options.use_chromium = True

unpacked_extension_path = os.path.join(os.getcwd(), "markdown-clipper")
options.add_argument(f"--load-extension={unpacked_extension_path}")

download_path = os.path.join(os.getcwd(), "Markdown Output\\")
prefs = {}
prefs["download.default_directory"] = download_path
prefs["profile.default_content_settings.popups"] = 0
prefs["directory_upgrade"] = True
options.add_experimental_option("prefs", prefs)
options.add_experimental_option("detach", True)

driver = Edge(options=options)

url = "https://networklessons.com/cisco/ccna-200-301"
driver.get(url)

# In[43]:

# Look both overlays up first, then click the first match of each, if any.
agree = driver.find_elements_by_xpath('//*[@id="catapult-cookie-bar"]/div/div')
join = driver.find_elements_by_xpath(
    '//*[@id="om-mvhsujbebu4nqhlzcsgs-optin"]/div/button')

for matches in (agree, join):
    if matches:
        matches[0].click()

# In[ ]:
Beispiel #5
0
class Web_scraping:
    """Collect World Surf League event links and per-heat scores with Edge."""

    # Every surfer occupies 18 consecutive "score" cells on the page: best
    # wave 1, best wave 2, heat total and 15 individual wave slots.
    _SCORES_PER_SURFER = 18
    # A heat row stores exactly two surfers (surfer1 / surfer2).
    _SURFERS_PER_HEAT = 2
    # Attribute suffixes, in row order, for the values of one surfer.
    _SURFER_FIELDS = ('', '_nat', '_best_w1', '_best_w2', '_total') + tuple(
        f'_w{n:02d}' for n in range(1, 16))

    def __init__(self):
        '''Initialize the application'''
        # The standard webdriver printed warnings that cluttered the
        # terminal, so logging switches are excluded here.
        self.opt = EdgeOptions()
        self.opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.opt.add_argument("--start-maximized")
        self.opt.use_chromium = True
        self.driver = Edge(
            executable_path=
            r"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedgedriver.exe",
            options=self.opt)

    def games_link(self):
        '''Collect the links of completed events for seasons 2008-2021.

        Visits every season page, stores the event links in
        ``self.events_link_list`` and writes them, one per line, to
        ``events.txt`` in the working directory.
        '''
        # One listing page per season.
        self.season_pages_list = []
        for y in range(2008, 2022):
            self.season_link = 'https://www.worldsurfleague.com/events/' + str(
                y) + '/mct?all=1'
            self.season_pages_list.append(self.season_link)

        self.events_link_list = []
        for link in self.season_pages_list:
            self.driver.get(link)
            self.event_links = self.driver.find_elements_by_xpath(
                '//a[@class="event-schedule-details__event-name"]')
            # Only as many links as there are "completed" status badges are
            # kept; presumably completed events are listed first -- verify.
            self.event_status = self.driver.find_elements_by_xpath(
                '//span[@class="event-status event-status--completed"]')

            # zip() pairs each badge with a link, and simply stops early
            # if the page shows more badges than links.
            for event_link, _status in zip(self.event_links,
                                           self.event_status):
                self.link_attribute = event_link.get_attribute('href')
                self.events_link_list.append(self.link_attribute)

        with open('events.txt', 'w') as f:
            f.writelines("%s\n" % item for item in self.events_link_list)

        print('FINISHED')

    def _surfer_stats(self, heat_index, slot):
        '''Return the row values of one surfer of a heat.

        ``slot`` is 0 for the first surfer of the heat and 1 for the second.
        The result is ``[name, nationality, 18 score texts]``; the same
        values are also stored on ``self`` as ``surfer<slot+1><suffix>``
        attributes (``surfer1``, ``surfer1_nat``, ``surfer1_w01``, ...).
        '''
        surfer_index = heat_index * self._SURFERS_PER_HEAT + slot
        # Scores advance by a full surfer block per surfer, so the offset is
        # derived from the surfer index, matching the athlete indexing.
        first_score = surfer_index * self._SCORES_PER_SURFER

        values = [
            self.athletes[surfer_index].text,
            self.nationalities[surfer_index].get_attribute('title'),
        ]
        values += [
            self.waves[first_score + offset].text
            for offset in range(self._SCORES_PER_SURFER)
        ]
        for suffix, value in zip(self._SURFER_FIELDS, values):
            setattr(self, f'surfer{slot + 1}{suffix}', value)
        return values

    def event_stats(self):
        '''Scrape the heats of the first event listed in ``events.txt``.

        Each heat becomes one list in ``self.all_heats_lists``: heat id,
        event name, location, tour stop, period, average wave score, then
        20 values per surfer for two surfers.

        NOTE: the ``while True`` loop has no exit condition of its own; it
        ends only when an exception propagates (for example
        ``ZeroDivisionError`` when a page shows no heat headers, or a failed
        lookup of the previous-round button).
        '''
        # Read the event links back from the text file.
        self.events_link = [
            line[0]
            for line in pd.read_fwf('events.txt', header=None).values.tolist()
        ]

        self.driver.get(self.events_link[0])

        self.all_heats_lists = []

        while True:
            # Everything shown for the currently selected round.
            self.waves = self.driver.find_elements_by_xpath(
                '//*[@class="score"]')
            self.athletes = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-name"]')
            self.nationalities = self.driver.find_elements_by_xpath(
                '//*[@class="athlete-country-flag"]')
            self.heat = self.driver.find_elements_by_xpath(
                '//*[@class="new-heat-hd-name"]')

            # Round name; the last carousel item carries an extra class.
            self.round = self.driver.find_elements_by_xpath(
                '//*[@class="carousel-item is-selected"]')
            if len(self.round) == 0:
                self.round = self.driver.find_elements_by_xpath(
                    '//*[@class="carousel-item last is-selected"]')

            self.number_of_surfers = int(
                len(self.waves) / self._SCORES_PER_SURFER)
            self.number_of_heats = len(self.heat)
            self.surfers_per_heat = int(self.number_of_surfers /
                                        self.number_of_heats)

            self.count = 0
            self.heat_data = []
            for g in range(0, self.number_of_heats):
                # Event-level values, read from the page for every heat.
                self.event_turn = self.driver.find_element_by_class_name(
                    'event-meta-tour-info').text.split()[2][1:]
                self.event_period = self.driver.find_element_by_class_name(
                    'event-schedule__date-range').text
                self.event_name = self.driver.find_element_by_class_name(
                    'event-title').text.split('\n')[0]
                self.event_local = re.split(
                    r'(\d+)',
                    self.driver.find_element_by_class_name(
                        'event-meta-tour-info').text)[2]
                self.avg_wave_score = re.split(
                    r'(\d+\.\d+)',
                    self.driver.find_element_by_class_name(
                        'new-heat-hd-status').text)[1]

                # Heat id used as the database key.
                self.heat_id = (f'heat{g + 1}' + self.round[0].text +
                                self.event_turn +
                                self.event_period[-4:]).lower()

                self.heat_data.extend([
                    self.heat_id,
                    self.event_name,
                    self.event_local,
                    self.event_turn,
                    self.event_period,
                    self.avg_wave_score,
                ])
                self.heat_data.extend(self._surfer_stats(g, 0))
                self.heat_data.extend(self._surfer_stats(g, 1))
                self.all_heats_lists.append(self.heat_data.copy())
                self.heat_data.clear()

            print(self.all_heats_lists)

            # Move to the previous round; if the plain click fails, scroll
            # the button into view and try once more.
            self.prev_round_bt = self.driver.find_element_by_xpath(
                '//*[@class="flickity-button-icon"]')
            try:
                self.prev_round_bt.click()
            except Exception:
                self.prev_round_bt = self.driver.find_element_by_xpath(
                    '//*[@class="flickity-button-icon"]')
                self.driver.execute_script("arguments[0].scrollIntoView();",
                                           self.prev_round_bt)
                time.sleep(.5)
                self.prev_round_bt.click()
            time.sleep(2.5)
Beispiel #6
0
# Accumulators: `emails` holds the text of each list item scraped from the
# page, `passwords` the MD5 hex digests derived from them. `users` is not
# filled in the code shown here.
users = []
emails = []
passwords = []

options = EdgeOptions()
options.use_chromium = True
options.add_argument('headless')

# Raw string: "\P" and "\m" are not valid escape sequences, so a plain
# literal triggers an invalid-escape warning; the path value is unchanged.
driver = Edge(executable_path=r'C:\Program Files\msedgedriver.exe',
              options=options)

driver.get(url)

# Fixed pause before reading the page content.
time.sleep(5)

lis = driver.find_elements_by_xpath(
    "/html/body/div/div[1]/main/article/div[2]/ol/li")

for li in lis:
    emails.append(li.text)

# Derive a password per e-mail: local part + '1234', stored as MD5 hex.
# NOTE(review): MD5 is not a suitable password hash -- confirm this is only
# test/seed data before reusing it.
for email in emails:
    pwd = email.split('@')[0]
    pwd += '1234'
    pwd = hashlib.md5(pwd.encode())
    passwords.append(pwd.hexdigest())

#-------------------------------------------------------

client = pymongo.MongoClient(
    "mongodb+srv://sgdb:[email protected]/Music?retryWrites=true&w=majority"
)
Beispiel #7
0
class Session:
    """Edge-driven Twitter session: log in, search, export tweets to CSV."""

    def __init__(self, username, password, sleep_time=2):
        """Store the credentials and start a Chromium-based Edge driver.

        Args:
            username: Text typed into the username/e-mail field.
            password: Text typed into the password field.
            sleep_time: Seconds to pause between page interactions.
        """
        self.username = username
        self.password = password
        self.sleep_time = sleep_time
        options = EdgeOptions()
        options.use_chromium = True
        self.driver = Edge(options=options)

    def login(self):
        """Open the login page and submit the stored credentials."""
        self.driver.get("https://www.twitter.com/login")
        sleep(self.sleep_time)
        u_name = self.driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        u_name.send_keys(self.username)
        p_word = self.driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        p_word.send_keys(self.password)
        p_word.send_keys(Keys.RETURN)
        sleep(self.sleep_time)

    def tweet_selection(self, search_str, csv_tit, max_tweets=300):
        """Search for *search_str*, scroll through the results, write a CSV.

        Scrolling stops once the page offset has not changed for three
        consecutive attempts.

        Args:
            search_str: Query typed into the search box.
            csv_tit: Path of the CSV file to (over)write.
            max_tweets: Accepted for compatibility; the current
                implementation does not enforce it.
        """
        sleep(self.sleep_time)
        search_input = self.driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.clear()
        search_input.send_keys(search_str)
        search_input.send_keys(Keys.RETURN)
        sleep(self.sleep_time)
        data = []
        tweet_ids = set()
        last_pos = self.driver.execute_script("return window.pageYOffset;")
        scrolling = True
        while scrolling:
            cards = self.driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # Only the last 15 cards are inspected on each pass; duplicates
            # are filtered by the concatenated field values.
            for card in cards[-15:]:
                tweet = self.get_tweet_data(card)
                if tweet:
                    tweet_id = ''.join(tweet)
                    if tweet_id not in tweet_ids:
                        tweet_ids.add(tweet_id)
                        data.append(tweet)
            scroll_attempt = 0
            while True:
                self.driver.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight);')
                sleep(self.sleep_time)
                curr_pos = self.driver.execute_script(
                    "return window.pageYOffset;")
                if last_pos == curr_pos:
                    scroll_attempt += 1
                    if scroll_attempt >= 3:
                        scrolling = False
                        break
                    else:
                        sleep(2 * self.sleep_time)
                else:
                    last_pos = curr_pos
                    break
        self._write_csv(csv_tit, data)

    def _write_csv(self, csv_tit, data):
        """Write the header row and *data* rows to *csv_tit* as UTF-8 CSV."""
        # newline='' lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        with open(csv_tit, 'w', encoding="utf-8", newline='') as out:
            csv_out = csv.writer(out)
            csv_out.writerow([
                'user', 'date', 'text', 'quoting', 'reply count',
                'retweet count', 'like count'
            ])
            csv_out.writerows(data)

    def get_tweet_data(self, card):
        """Extract the fields of one tweet card.

        Returns:
            ``(user, date, text, responding, reply_count, retweet_count,
            like_count)``, or ``None`` when the card has no ``<time>``
            element.
        """
        user = card.find_element_by_xpath('.//span[contains(text(),"@")]').text
        try:
            date = card.find_element_by_xpath('.//time').get_attribute(
                'datetime')
        except NoSuchElementException:
            return None
        text = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
        responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
        reply_count = card.find_element_by_xpath(
            './/div[@data-testid="reply"]').text
        retweet_count = card.find_element_by_xpath(
            './/div[@data-testid="retweet"]').text
        like_count = card.find_element_by_xpath(
            './/div[@data-testid="like"]').text
        return (user, date, text, responding, reply_count, retweet_count,
                like_count)

    def tweet(self, tuit):  # REQUIRES INTERACTING WITH EDGE
        """Type *tuit* into the tweet text area (after clearing it)."""
        sleep(self.sleep_time)
        tuit_input = self.driver.find_element_by_xpath(
            '//div[@data-testid="tweetTextarea_0"]')
        tuit_input.clear()
        tuit_input.send_keys(tuit)
class TwitterBot():
    """Edge-driven helper that logs in to Twitter, searches and scrapes text."""

    def __init__(self):
        """Start Edge maximized on twitter.com with a 3 second implicit wait."""
        self.driver = Edge()
        self.driver.maximize_window()
        self.driver.get('https://twitter.com')
        self.driver.implicitly_wait(3)

    def goToTwitter(self):
        """Navigate to the Twitter home page."""
        self.driver.get('https://twitter.com')

    def login(self):
        """Open the login form and submit the credentials.

        Reads the names ``username`` and ``password`` from the enclosing
        module scope; they are not attributes of this class.
        """
        self.driver.find_element_by_xpath("//a[@href='/login']").click()

        # Wait: before this delay another element with the same name exists,
        # and the right one must be picked to interact with it.
        sleep(1)
        self.driver.find_element_by_xpath(
            "//input[@name='session[username_or_email]']").send_keys(username)
        self.driver.find_element_by_xpath(
            "//input[@name='session[password]']").send_keys(password)

        self.driver.find_element_by_xpath(
            "//div[@data-testid='LoginForm_Login_Button']").click()

    def basicSearch(self, topic):
        """Type *topic* into the search box and submit it."""
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(topic)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def advancedSearch(self, exact, any, none, hashtags, dateFrom, dateTo):
        """Build a search query from the given parts and submit it.

        Every argument may be ``None`` to leave that part out.

        Args:
            exact: Phrase wrapped in double quotes.
            any: Terms wrapped in parentheses.
            none: Term prefixed with ``-``.
            hashtags: Hashtag text, emitted as ``(#...)``.
            dateFrom: Value for ``since:``.
            dateTo: Value for ``until:``.
        """
        # Parts are emitted in this fixed order, each followed by a space.
        parts = []
        if exact is not None:
            parts.append('"' + exact + '" ')
        if any is not None:
            parts.append('(' + any + ') ')
        if none is not None:
            parts.append('-' + none + ' ')
        if hashtags is not None:
            parts.append('(#' + hashtags + ') ')
        if dateTo is not None:
            parts.append('until:' + dateTo + ' ')
        if dateFrom is not None:
            parts.append('since:' + dateFrom + ' ')
        finalSearch = ''.join(parts)

        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").send_keys(
                finalSearch)
        self.driver.find_element_by_xpath(
            "//input[@data-testid='SearchBox_Search_Input']").submit()

    def scrapeTweets(self, desiredNum):
        """Collect up to *desiredNum* text lines from the visible tweets.

        Each kept line is written, with punctuation removed, to
        ``tweets.csv`` in the working directory. Scraping stops when enough
        lines were collected or when a pass yields the same lines as the
        previous one.

        Returns:
            The kept lines (with punctuation) concatenated into one string.
        """
        collected = []
        oldDataLines = []
        dataLines = ['init']

        # Fixed strings that are never tweet content.
        dirtyArray = [
            'Quote Tweet', 'Promoted', 'Show this thread', '', '\n', ' '
        ]
        numDataLines = 0
        # The context manager closes the file even if scraping raises;
        # UTF-8 lets non-ASCII tweet text be written on every platform.
        with open('tweets.csv', 'w', encoding='utf-8') as tweetsFile:
            while numDataLines < desiredNum and oldDataLines != dataLines:

                oldDataLines = dataLines
                sleep(1)
                # Elements whose text must not be picked up as tweet lines.
                dirtyData = self.driver.find_elements_by_xpath(
                    "//div[@class='css-1dbjc4n r-1d09ksm r-18u37iz r-1wbh5a2']")
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[@class = 'css-1dbjc4n r-18u37iz r-1wtj0ep r-156q2ks r-1mdbhws']"
                ))
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[contains(text(),'Replying to')]"))
                dirtyData.extend(self.driver.find_elements_by_xpath(
                    "//div[@role = 'blockquote']"))

                # Element text spans several lines; split it so that every
                # entry is a single line.
                dirtyLines = set()
                for dirt in dirtyData:
                    dirtyLines.update(dirt.text.split('\n'))

                # Tweet cards, still containing the unwanted lines.
                data = self.driver.find_elements_by_xpath(
                    "//div[@data-testid='tweet']")

                dataLines = []
                for datapoint in data:
                    dataLines.extend(datapoint.text.split('\n'))

                # Lines seen on the previous pass are skipped as well.
                seenBefore = set(oldDataLines)
                for line in dataLines:
                    if (line in dirtyLines or line in seenBefore
                            or line in dirtyArray):
                        continue
                    if numDataLines >= desiredNum:
                        break
                    try:
                        noPunctuationLine = re.sub(r'[^\w\s]', '', line)
                        tweetsFile.write(noPunctuationLine)
                        tweetsFile.write("\n")
                        collected.append(line)
                        numDataLines += 1
                    except UnicodeEncodeError:
                        print('This data point not encodable.')

                height = self.driver.execute_script(
                    "return document.documentElement.scrollHeight")
                self.driver.execute_script("window.scrollTo(0, " +
                                           str(height) + ");")

        return ''.join(collected)
Beispiel #9
0
def scrape(secure=False):
    """Log in to Twitter, run a fixed search and collect the 'Latest' tweets.

    Args:
        secure: When true, a second set of credentials is submitted after
            the first login attempt.

    Returns:
        List of the distinct truthy values returned by ``scrap_tweets`` for
        the tweet cards seen while scrolling.
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    driver = Edge(options=edge_options)

    def submit_credentials(user_text, password_text):
        # Fill both login fields, press RETURN and give the page 2 seconds.
        user_box = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        user_box.send_keys(user_text)
        password_box = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        password_box.send_keys(password_text)
        password_box.send_keys(Keys.RETURN)
        sleep(2)

    # The prompt is shown, but the search below uses a fixed query and the
    # typed text is not read again.
    input("▁ ▂ ▄ ▅ ▆ ▇ █ 𝐄𝐧𝐭𝐞𝐫 𝐭𝐡𝐞 𝐓𝐞𝐱𝐭 𝐭𝐨 𝐬𝐞𝐚𝐫𝐜𝐡 █ ▇ ▆ ▅ ▄ ▂ ▁\n\n ")

    print("\n𝘚𝘵𝘢𝘳𝘵𝘦𝘥 𝘚𝘤𝘳𝘢𝘱𝘪𝘯𝘨 ↦↦↦↦↦↦↦↦↦↦")
    print("\nPlease Wait ............\n")

    driver.get("https://www.twitter.com/login")
    driver.maximize_window()

    submit_credentials("*****@*****.**", '-----')
    if secure:
        submit_credentials("031-----", '----')

    search_box = driver.find_element_by_xpath(
        '//input[@aria-label="Search query"]')
    search_box.send_keys('"پاک فوج" lang:ur -filter:links filter:replies')
    search_box.send_keys(Keys.RETURN)
    sleep(1.5)
    driver.find_element_by_link_text("Latest").click()

    collected = []
    seen_ids = set()
    last_position = driver.execute_script("return window.pageYOffset;")

    while True:
        # Inspect only the last 15 cards currently on the page.
        visible_posts = driver.find_elements_by_xpath(
            '//div[@data-testid="tweet"]')
        for post in visible_posts[-15:]:
            tweet = scrap_tweets(post)
            if not tweet:
                continue
            identity = "".join(tweet)
            if identity in seen_ids:
                continue
            seen_ids.add(identity)
            collected.append(tweet)

        # Scroll to the bottom; give up after three attempts in a row that
        # leave the page offset unchanged.
        stalled = 0
        while stalled < 3:
            driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight);")
            sleep(1)
            current_position = driver.execute_script(
                "return window.pageYOffset;")
            if current_position != last_position:
                last_position = current_position
                break
            stalled += 1
            if stalled < 3:
                sleep(2)
        else:
            return collected
Beispiel #10
0
def main():
    """Scrape tweets for every name in a CSV file and build a report per name.

    The CSV path is read from ``sys.argv[4]``. The first comma-separated
    field of each non-blank line is the name to query. For every name a new
    Edge session logs in to Twitter, searches ``"<name> filter:verified"``,
    opens the "People" tab, clicks the first element matching a hard-coded
    result class, collects tweets over five scroll steps, hands them to
    ``write_to_csv`` and then runs the three models on ``<name>.csv`` before
    calling ``create_report``.
    """
    csv_path = sys.argv[4]
    # Only the first column is used below, so only that column is parsed.
    # The context manager guarantees the file handle is released.
    with open(csv_path, "r") as handle:
        names = [line.split(",")[0].strip() for line in handle]

    for name in names:
        if not name:
            # Blank line in the input file: nothing to query.
            continue
        print("Query:", name, ".\nProcessing...")
        user = '******'
        search_term = f'{name} filter:verified'
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        tweet_data = []
        try:
            driver.get('https://www.twitter.com/login')
            driver.maximize_window()
            sleep(2)
            username = driver.find_element_by_xpath(
                '//input[@name="session[username_or_email]"]')
            username.send_keys(user)
            password = driver.find_element_by_xpath(
                '//input[@name="session[password]"]')
            # NOTE(review): hard-coded credential; should come from the
            # environment or a prompt rather than the source file.
            password.send_keys('donkey123')
            password.send_keys(Keys.RETURN)
            sleep(1)
            search_input = driver.find_element_by_xpath(
                '//input[@aria-label="Search query"]')
            search_input.send_keys(search_term)
            search_input.send_keys(Keys.RETURN)
            sleep(1)
            driver.find_element_by_link_text('People').click()
            sleep(3)
            driver.find_element_by_xpath(
                '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
            ).click()
            sleep(3)

            scroll_to = 500
            for _ in range(5):
                sleep(1)
                cards = driver.find_elements_by_xpath(
                    '//div[@data-testid="tweet"]')
                for card in cards:
                    tweet = get_tweet_data(card)
                    if tweet:
                        tweet_data.append(tweet)
                # scrollTo takes (x, y); only the vertical offset advances.
                driver.execute_script(f'window.scrollTo(0,{scroll_to});')
                scroll_to += 500
        finally:
            # quit() ends the whole driver session even when a step above
            # fails, so no browser process is left behind per name.
            driver.quit()

        # The same card is seen on several scroll steps; the set removes the
        # duplicates (tweets are assumed hashable, as in the original code).
        tweets = set(tweet_data)
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')
        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)

        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
Beispiel #11
0
class Sei:
    """Drive a web application through a Selenium WebDriver session.

    NOTE(review): presumably targets a SEI installation, judging by the
    class and element names -- confirm.
    """

    # Class-level default. NOTE(review): start_driver() and close() store and
    # read the starting unit under ``__area_incial`` (different spelling), so
    # this attribute is not read anywhere in the visible code.
    __area_inicial = None
    # Number of browser windows counted by go_to() immediately before and
    # immediately after it submits the quick-search form.
    __windows_before = 0
    __windows_after = 0

    def __init__(self, headless=False, executable_path='chromedriver'):
        if 'chromedriver' in executable_path:
            chrome_options = Options()
            chrome_options.add_argument('--enable-javascript')
            chrome_options.add_argument('--window-size=1440,900')
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--proxy-server='direct://'")
            chrome_options.add_argument("--proxy-bypass-list=*")
            chrome_options.add_argument("--start-maximized")
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--ignore-certificate-errors')
            if headless:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(executable_path=executable_path,
                                           options=chrome_options)
        elif 'msedgedriver' in executable_path:
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('enable-javascript')
            edge_options.add_argument('window-size=1440,900')
            edge_options.add_argument("disable-extensions")
            edge_options.add_argument("proxy-server='direct://'")
            edge_options.add_argument("proxy-bypass-list=*")
            edge_options.add_argument("start-maximized")
            edge_options.add_argument('disable-dev-shm-usage')
            edge_options.add_argument('no-sandbox')
            edge_options.add_argument('ignore-certificate-errors')
            if headless:
                edge_options.add_argument('headless')
                edge_options.add_argument('disable-gpu')
            self.driver = Edge(executable_path=executable_path,
                               options=edge_options)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def start_driver(self, url, usuario=None, senha=None):
        """Open *url* and submit the login form.

        Args:
            url: Address of the page holding the login form.
            usuario: User name; asked for with ``input`` when ``None``.
            senha: Password; asked for with ``getpass`` when ``None``.

        Raises:
            Exception: Carrying the alert text when an alert shows up after
                the form is submitted (the original comment attributes this
                to an invalid user or password).
        """
        # Identity check: only a missing argument triggers the prompt.
        if usuario is None:
            usuario = input('Digite o usuário: ')
        if senha is None:
            senha = getpass('Digite a senha: ')

        self.driver.get(url)

        # Wait up to 3 s for the user field; the other two controls are
        # looked up directly once it is present.
        usuario_field = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtUsuario")))

        senha_field = self.driver.find_element_by_id('pwdSenha')
        botao_acessar = self.driver.find_element_by_id('sbmLogin')

        usuario_field.clear()
        usuario_field.send_keys(usuario)
        senha_field.clear()
        senha_field.send_keys(senha)
        botao_acessar.click()
        alerta = self.fechar_alerta()
        if alerta:
            raise Exception(alerta)  # invalid user or password
        # Remember the unit selected right after login; close() reads this
        # attribute back under exactly this spelling.
        self.__area_incial = self.get_area()

    def go_to(self, numero_sei):
        """Open the item *numero_sei* through the quick-search form.

        If the previous call ended with more windows than it started with,
        the current window is closed and focus moves to the window at index
        ``windows_before - 1`` first. After the form is submitted the window
        counts are refreshed, and focus moves to the last counted window when
        the count grew.
        """
        driver = self.driver
        opened_extra_window = self.__windows_after > self.__windows_before
        if opened_extra_window:
            driver.close()
            driver.switch_to.window(
                driver.window_handles[self.__windows_before - 1])
        driver.switch_to.default_content()

        campo_pesquisa = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtPesquisaRapida")))
        campo_pesquisa.clear()
        campo_pesquisa.send_keys(str(numero_sei))

        formulario = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located(
                (By.ID, "frmProtocoloPesquisaRapida")))

        # Count the windows on both sides of the submit to detect a popup.
        self.__windows_before = len(driver.window_handles)
        formulario.submit()
        self.__windows_after = len(driver.window_handles)

        if self.__windows_after > self.__windows_before:
            driver.switch_to.window(
                driver.window_handles[self.__windows_after - 1])

    def is_processo_aberto(self, area=None, processo=None):
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            informacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "divInformacao")))
            mensagem = informacao.text
            aberto = 'aberto' in mensagem
            if area:
                regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
                matches = search(regex, mensagem)
                if matches:
                    aberto = True
                else:
                    aberto = False
            self.driver.switch_to.default_content()
        except:
            aberto = None
            mensagem = 'Impossível abrir mensagem do processo'
        return aberto, mensagem

    def get_processo_anexador(self, processo=None):
        """Return the text of the link to the process this one is attached to.

        Args:
            processo: Optional number opened with ``go_to`` first; otherwise
                the current page is inspected.

        Returns:
            The link text found under ``divInformacao`` when the panel text
            contains ``'Processo anexado ao processo'``, else ``None``.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()

        quadro = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(quadro)
        painel = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))

        if 'Processo anexado ao processo' in painel.text:
            link = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="divInformacao"]/div/a')))
            anexador = link.text
        else:
            anexador = None

        self.driver.switch_to.default_content()
        return anexador

    def get_area(self):
        """Return the text of the first selected ``selInfraUnidades`` option."""
        self.driver.switch_to.default_content()
        unidades = self.driver.find_element_by_id('selInfraUnidades')
        selecionadas = Select(unidades).all_selected_options
        return selecionadas[0].text

    def seleciona_area(self, area):
        """Select the unit *area* in the ``selInfraUnidades`` dropdown.

        Returns:
            ``True`` when *area* was already selected or has just been
            selected, ``False`` when no option carries that exact text.
        """
        self.driver.switch_to.default_content()
        atual = Select(self.driver.find_element_by_id('selInfraUnidades'))
        if any(area == opcao.text for opcao in atual.all_selected_options):
            return True

        select = Select(self.driver.find_element_by_id('selInfraUnidades'))
        for opcao in select.options:
            if area != opcao.text:
                continue
            select.select_by_visible_text(area)
            # Wait (up to 3 s) until the dropdown is present again; the
            # resulting Select object itself is not used.
            Select(
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, 'selInfraUnidades'))))
            return True

        return False

    def clicar_botao(self, botao):
        """Click the action-bar button whose image title contains *botao*.

        The buttons are the ``<a>`` children of ``divArvoreAcoes`` inside the
        ``ifrVisualizacao`` frame. After the click the method waits up to one
        second for an alert. When none shows up it switches back to the
        top-level document; when one does, the alert is left open for the
        caller.

        Returns:
            ``True`` when a matching button was clicked, ``False`` otherwise.
        """
        self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        arvore = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divArvoreAcoes")))
        botoes = arvore.find_elements(By.XPATH,
                                      '//*[@id="divArvoreAcoes"]/a')

        for b in botoes:
            img = b.find_element(By.XPATH, 'img')
            # get_attribute returns None for a missing attribute; treat that
            # as an empty title instead of failing on the "in" test.
            titulo = img.get_attribute('title') or ''
            if botao not in titulo:
                continue
            b.click()
            try:
                WebDriverWait(self.driver, 1).until(
                    EC.alert_is_present(),
                    'Timed out waiting for PA creation ' +
                    'confirmation popup to appear.')
            except Exception:
                # No alert within the timeout (or the wait itself failed):
                # best-effort return to the top-level document.
                try:
                    self.driver.switch_to.default_content()
                except Exception:
                    pass
            return True
        return False

    def fechar_alerta(self):
        """Accept a pending alert and return its text.

        Waits up to three seconds for an alert. Returns ``None`` when the
        wait times out.
        """
        texto = None
        try:
            espera = WebDriverWait(self.driver, 3)
            espera.until(
                EC.alert_is_present(),
                'Timed out waiting for PA creation confirmation popup to appear.')
            popup = self.driver.switch_to.alert
            texto = popup.text
            popup.accept()
            self.driver.switch_to.default_content()
        except TimeoutException:
            pass
        return texto

    def is_sobrestado(self, area=None, processo=None):
        """Tell whether the information panel reports the process as suspended.

        Args:
            area: Optional unit name, inserted unescaped into a regular
                expression that looks for a line containing *area* followed
                by a character that is neither a digit nor a letter.
            processo: Optional number opened with ``go_to`` first; otherwise
                the current page is inspected.

        Returns:
            ``(sobrestado, matched)`` when *area* is given, where ``matched``
            is whether the expression matched; otherwise
            ``(sobrestado, mensagem)`` with the panel text. ``sobrestado`` is
            whether the text contains ``'sobrestado'``.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        # Read the text once, while still inside the frame that owns the
        # element; the element must not be touched after leaving the frame.
        mensagem = informacao.text
        sobrestado = 'sobrestado' in mensagem
        self.driver.switch_to.default_content()
        if area:
            regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
            return sobrestado, search(regex, mensagem) is not None
        return sobrestado, mensagem

    def sobrestar_processo(self, motivo, processo=None):
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        if self.clicar_botao('Sobrestar Processo'):
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            self.driver.find_element(By.ID, 'divOptSomenteSobrestar').click()
            motivoField = self.driver.find_element(By.ID, 'txaMotivo')
            motivoField.clear()
            motivoField.send_keys(motivo)
            self.driver.find_element(By.ID, 'sbmSalvar').click()
            self.driver.switch_to.default_content()
            return True
        return False

    def remover_sobrestamento(self, processo=None):
        if processo:
            self.go_to(processo)
        if self.clicar_botao('Remover Sobrestamento do Processo'):
            self.fechar_alerta()
            return True
        return False

    def publicar(self,
                 resumo_ementa,
                 data_disponibilizacao,
                 documento=None,
                 dou=False,
                 secao=None,
                 pagina=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        if self.clicar_botao('Agendar Publicação'):
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)

            resumo_ementa_text_field = self.driver.find_element(
                By.ID, 'txaResumo')
            resumo_ementa_text_field.clear()
            resumo_ementa_text_field.send_keys(resumo_ementa)

            disponibilizacao = self.driver.find_element(
                By.ID, 'txtDisponibilizacao')
            disponibilizacao.clear()
            disponibilizacao.send_keys(data_disponibilizacao)

            if dou:
                select = Select(self.driver.find_element_by_id('selVeiculoIO'))
                select.select_by_visible_text('DOU')

                select = Select(
                    WebDriverWait(self.driver, 3).until(
                        EC.presence_of_element_located((By.ID, "selSecaoIO"))))
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         "option[value='" + secao if secao else '3' + "']")))
                select.select_by_visible_text(secao if secao else '3')

                pagina_text_field = self.driver.find_element(
                    By.ID, 'txtPaginaIO')
                pagina_text_field.clear()
                pagina_text_field.send_keys(pagina if pagina else '')

                disponibilizacao = self.driver.find_element(By.ID, 'txtDataIO')
                disponibilizacao.clear()
                disponibilizacao.send_keys(data_disponibilizacao)

            self.driver.find_element_by_id('btnSalvar').click()

            self.driver.switch_to.default_content()
            return True
        return False

    def get_conteudo_documento(self, documento=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            documento_conteudo = self.driver.find_element_by_xpath(
                '/html/body').get_attribute('innerHTML')
            documento_conteudo = sub(
                r'\\n', '', documento_conteudo)  # retirar quebra de páginas
            documento_conteudo = sub(r'\s\s+?', ' ',
                                     documento_conteudo)  # tira espaços duplos
            documento_conteudo = sub(r'&nbsp;', ' ',
                                     documento_conteudo)  # tira espaços duplos
            documento_conteudo = documento_conteudo.strip(
            )  # retirar quebras de páginas que tenham restado
            return documento_conteudo
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_id(self, id, documento=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_id(id).text
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_id(self, id, documento=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_id(id)
            return [element.text for element in elements]
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_xpath(self, xpath, documento=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_xpath(xpath).text
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_xpath(self, xpath, documento=None):
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_xpath(xpath)
            return [element.text for element in elements]
        except:
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento)
        finally:
            self.driver.switch_to.default_content()

    def close(self, voltar=True):
        if voltar:
            self.seleciona_area(self.__area_incial)
        self.driver.close()
        self.driver.quit()