def __init__(self, batch_num, site_num, instance_num, page_url, base_dir,
             tor_controller, bg_site=None,
             experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
             capture_screen=True):
    """Set up a single crawl visit: directories, optional Xvfb display,
    a TorBrowserDriver instance and a packet sniffer.

    NOTE(review): parameter semantics inferred from names only
    (batch/site/instance appear to index the visit) — confirm against
    callers.
    """
    self.batch_num = batch_num
    self.site_num = site_num
    self.instance_num = instance_num
    self.page_url = page_url
    self.bg_site = bg_site
    self.experiment = experiment
    self.base_dir = base_dir
    self.visit_dir = None
    self.visit_log_dir = None
    self.tbb_version = cm.RECOMMENDED_TBB_VERSION
    self.capture_screen = capture_screen
    self.tor_controller = tor_controller
    self.xvfb = xvfb
    self.init_visit_dir()  # populates self.visit_dir used below
    # pcap capture file named after this visit instance
    self.pcap_path = os.path.join(
        self.visit_dir, "{}.pcap".format(self.get_instance_name()))
    # Virtual display only outside CI (CI presumably provides its own)
    if self.xvfb and not cm.running_in_CI:
        wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
        self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
        self.vdisplay.start()
    # Create new instance of TorBrowser driver
    TorBrowserDriver.add_exception(self.page_url)
    self.tb_driver = TorBrowserDriver(
        tbb_path=cm.TBB_PATH,
        tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
    self.sniffer = Sniffer()  # sniffer to capture the network traffic
class TorBrowserWrapper(object):
    """Wraps the TorBrowserDriver to configure it at the constructor
    and run it with the `launch` method.

    We might consider to change the TorBrowserDriver itself to follow
    torcontroller and stem behaviour: init configures and a method is
    used to launch driver/controller, and this method is the one used
    to implement the contextmanager.
    """

    def __init__(self, *args, **kwargs):
        # Constructor arguments are stored and forwarded verbatim to
        # TorBrowserDriver when `launch` is entered.
        self.args = args
        self.kwargs = kwargs
        self.driver = None

    def __getattr__(self, item):
        # Delegate attribute access to the underlying driver; before
        # launch (driver is None) attribute lookups resolve to None.
        if self.driver is None:
            return
        if item == "launch":
            return getattr(self, item)
        return getattr(self.driver, item)

    @contextmanager
    def launch(self):
        """Start the driver, yield it, and always quit it.

        Fix: the original quit the driver only on the success path, so
        an exception inside the `with` body leaked the browser process.
        """
        self.driver = TorBrowserDriver(*self.args, **self.kwargs)
        try:
            yield self.driver
        finally:
            self.driver.quit()
def test_close_all_streams(self):
    """After close_all_streams(), the controller must report no open streams.

    Fix: the original used a Python 2 `print` statement, which is a
    SyntaxError under Python 3; converted to the print() function.
    """
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_PATH)
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)  # allow the page load to establish streams
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
def __init__(self,
             take_ownership=True,  # Tor dies when the Crawler does
             torrc_config=None,
             tor_log="/var/log/tor/tor.log",
             tor_cell_log="/var/log/tor/tor_cell_seq.log",
             control_port=9051,
             socks_port=9050,
             run_in_xvfb=True,
             tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
             tb_log_path=join(_log_dir, "firefox.log"),
             tb_tor_cfg=USE_RUNNING_TOR,
             page_load_timeout=20,
             wait_on_page=5,
             wait_after_closing_circuits=0,
             restart_on_sketchy_exception=True,
             additional_control_fields=None,
             db_handler=None):
    """Launch tor, (optionally) Xvfb and Tor Browser for a crawl.

    Fixes:
    - `torrc_config` / `additional_control_fields` previously defaulted
      to mutable dict literals; `torrc_config` was then mutated via
      .update(), leaking ports/log settings across instances and into
      the caller's dict. Defaults are now None and the config is copied.
    - `self.run_in_xvfb` was only assigned when True; it is now always
      set so the attribute exists either way.
    """
    self.logger = setup_logging(_log_dir, "crawler")
    # Copy so we never mutate a shared default or the caller's dict.
    if torrc_config is None:
        self.torrc_config = {"CookieAuth": "1"}
    else:
        self.torrc_config = dict(torrc_config)
    self.socks_port = find_free_port(socks_port, control_port)
    self.torrc_config.update({"SocksPort": str(self.socks_port)})
    self.control_port = find_free_port(control_port, self.socks_port)
    self.torrc_config.update({"ControlPort": str(self.control_port)})
    self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
    # Explicit kwarg instead of the fragile **locals() formatting.
    self.logger.info("Starting tor process with config "
                     "{torrc_config}.".format(torrc_config=self.torrc_config))
    self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                              take_ownership=take_ownership)
    self.authenticate_to_tor_controlport()

    self.logger.info("Opening cell log stream...")
    self.cell_log = open(tor_cell_log, "rb")

    self.run_in_xvfb = run_in_xvfb
    if run_in_xvfb:
        self.logger.info("Starting Xvfb...")
        self.virtual_framebuffer = start_xvfb()

    self.logger.info("Starting Tor Browser...")
    self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                      tor_cfg=tb_tor_cfg,
                                      tbb_logfile_path=tb_log_path,
                                      socks_port=self.socks_port,
                                      control_port=self.control_port)

    self.wait_after_closing_circuits = wait_after_closing_circuits
    self.page_load_timeout = page_load_timeout
    self.tb_driver.set_page_load_timeout(page_load_timeout)
    self.wait_on_page = wait_on_page
    self.restart_on_sketchy_exception = restart_on_sketchy_exception

    self.control_data = self.get_control_data(
        page_load_timeout, wait_on_page, wait_after_closing_circuits,
        {} if additional_control_fields is None else additional_control_fields)
    self.db_handler = db_handler
    if db_handler:
        self.crawlid = self.db_handler.add_crawl(self.control_data)
def restart_tb(self):
    """Restarts the Tor Browser.

    NOTE(review): `tbb_path` and `tb_log_path` are not defined in this
    scope — they were constructor parameters, not attributes — so this
    method raises NameError as written. They should presumably be saved
    on `self` in __init__ and referenced here; confirm before relying
    on this method.
    """
    self.logger.info("Restarting the Tor Browser...")
    self.tb_driver.quit()
    # Re-create the driver against the still-running tor process.
    self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                      tor_cfg=USE_RUNNING_TOR,
                                      tbb_logfile_path=tb_log_path,
                                      socks_port=self.socks_port,
                                      control_port=self.control_port)
    self.logger.info("Tor Browser restarted...")
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController and browse with
    TorBrowserDriver.
    """

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # Launch a tor process that listens on a non-default SOCKS port.
        socks_port_no = 6666
        torrc = {'SocksPort': str(socks_port_no)}
        self.tor_controller = TorController(cm.TBB_PATH, torrc_dict=torrc)
        self.tor_process = self.tor_controller.launch_tor_service()

        # Point the browser at that port and load a page.
        self.tor_driver = TorBrowserDriver(cm.TBB_PATH,
                                           socks_port=socks_port_no)
        self.tor_driver.get("http://google.com")

        # Tear down browser first, then the tor process.
        self.tor_driver.quit()
        self.tor_controller.kill_tor_proc()
def test_run_driver_with_controller(self):
    """Drive a page load through a controller-launched tor instance."""
    # Start tor on a custom SOCKS port.
    port_no = 6666
    self.tor_controller = TorController(
        cm.TBB_PATH, torrc_dict={'SocksPort': str(port_no)})
    self.tor_process = self.tor_controller.launch_tor_service()

    # Browse through that port.
    self.tor_driver = TorBrowserDriver(cm.TBB_PATH, socks_port=port_no)
    self.tor_driver.get("http://google.com")

    # Clean up: browser, then tor.
    self.tor_driver.quit()
    self.tor_controller.kill_tor_proc()
def prepare_driver(disable_cookies=False, tor=False, v=False, headless=False):
    """Prepares a Selenium webdriver given multiple args.

    Parameters
    ----------
    disable_cookies : Boolean
        True to use a driver in incognito mode with cookies disabled.
    tor : Boolean
        True to use a Tor webdriver.
    v : Boolean
        Verbosity.
    headless : Boolean
        True to set the webdriver headless, which means not showing the
        Firefox window.

    Returns
    -------
    WebDriver
        A selenium or tbselenium webdriver (None on Tor timeout).
    xvfb_display
        The Xvfb process for hiding the tbselenium webdriver, or None.
    tor_process
        The Stem process for running the tbselenium webdriver, or None.
    """
    options = Options()
    if headless and v:
        print("Setting headless mode...")
    options.headless = headless
    if disable_cookies:
        firefox_profile = webdriver.FirefoxProfile()
        # set incognito mode
        firefox_profile.set_preference("browser.privatebrowsing.autostart",
                                       True)
        # disable cookies
        firefox_profile.set_preference("network.cookie.cookieBehavior", 2)
        driver = webdriver.Firefox(options=options,
                                   firefox_profile=firefox_profile)
    elif not tor:
        driver = webdriver.Firefox(options=options)
    else:
        if v:
            print("Configuring tor browser...")
        tbb_dir = Driver.TOR_PATH
        if headless:
            xvfb_display = start_xvfb()
        try:
            tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
        except OSError as e:
            if 'timeout' in str(e):
                print(
                    'Error: Tor connection timeout. Check URL or Internet connection'
                )
                return None, None, None
            else:
                # Fix: bare `raise` re-raises with the original
                # traceback intact (``raise e`` reset it).
                raise
        # Tor driver constructor
        driver = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        if headless:
            return driver, xvfb_display, tor_process
        else:
            return driver, None, tor_process
    return driver, None, None
import os from datetime import datetime from tbselenium.tbdriver import TorBrowserDriver from xvfbwrapper import Xvfb tor_dir = '../tor-browser-patched/Primary/' vdisplay = Xvfb() vdisplay.start() # open list of urls for testing with open('alexa-top-1000.txt', 'r') as url_file: test_urls = url_file.readlines() driver = TorBrowserDriver(tor_dir) #, pref_dict=rfp) driver.set_page_load_timeout(15) # do 10 runs uses = 0 notUses = 0 inconclusive = 0 for i, url in enumerate(test_urls): try: # request url from list #print("Fetching " + str(url),end='') url = 'https://' + url driver.get(url) # pull window.performance.timing after loading the page and add information about url and number of run perf_timings = driver.execute_script( "return window.performance.getEntries()")
class FunctionalTest(object):
    """Shared fixtures and browser helpers for SecureDrop functional tests.

    Manages two interchangeable webdrivers (Tor Browser and plain
    Firefox), spins the source/journalist apps up in subprocesses, and
    provides polling-safe click/send-keys helpers.
    """

    # Class-level defaults; instances overwrite these as they run.
    gpg = None
    new_totp = None
    session_expiration = 30
    secret_message = "These documents outline a major government invasion of privacy."
    timeout = 10
    poll_frequency = 0.1
    accept_languages = None
    default_driver_name = TORBROWSER
    driver = None
    firefox_driver = None
    torbrowser_driver = None
    driver_retry_count = 3
    driver_retry_interval = 5

    def _unused_port(self):
        # Bind to port 0 so the OS picks a free port, then release it.
        # (Small race window between close() and the later bind.)
        s = socket.socket()
        s.bind(("127.0.0.1", 0))
        port = s.getsockname()[1]
        s.close()
        return port

    def set_tbb_securitylevel(self, level):
        """Apply a Tor Browser security level; rejects unknown values."""
        if level not in {SECURITY_HIGH, SECURITY_MEDIUM, SECURITY_LOW}:
            raise ValueError("Invalid Tor Browser security setting: " + str(level))
        if hasattr(self, 'torbrowser_driver'):
            set_security_level(self.torbrowser_driver, level)

    def create_torbrowser_driver(self):
        """Create self.torbrowser_driver, retrying a few times on failure."""
        logging.info("Creating TorBrowserDriver")
        log_file = open(LOGFILE_PATH, "a")
        log_file.write("\n\n[%s] Running Functional Tests\n" % str(datetime.now()))
        log_file.flush()

        # Don't use Tor when reading from localhost, and turn off private
        # browsing. We need to turn off private browsing because we won't be
        # able to access the browser's cookies in private browsing mode. Since
        # we use session cookies in SD anyway (in private browsing mode all
        # cookies are set as session cookies), this should not affect session
        # lifetime.
        pref_dict = {
            "network.proxy.no_proxies_on": "127.0.0.1",
            "browser.privatebrowsing.autostart": False,
        }
        if self.accept_languages is not None:
            pref_dict["intl.accept_languages"] = self.accept_languages

        for i in range(self.driver_retry_count):
            try:
                self.torbrowser_driver = TorBrowserDriver(
                    TBB_PATH,
                    tor_cfg=cm.USE_RUNNING_TOR,
                    pref_dict=pref_dict,
                    tbb_logfile_path=LOGFILE_PATH,
                )
                logging.info("Created Tor Browser web driver")
                self.torbrowser_driver.set_window_position(0, 0)
                self.torbrowser_driver.set_window_size(1024, 1200)
                break
            except Exception as e:
                logging.error("Error creating Tor Browser web driver: %s", e)
                if i < self.driver_retry_count:
                    time.sleep(self.driver_retry_interval)
        if not self.torbrowser_driver:
            raise Exception("Could not create Tor Browser web driver")

    def create_firefox_driver(self):
        """Create self.firefox_driver, retrying a few times on failure."""
        logging.info("Creating Firefox web driver")
        profile = webdriver.FirefoxProfile()
        if self.accept_languages is not None:
            profile.set_preference("intl.accept_languages", self.accept_languages)
            profile.update_preferences()
        for i in range(self.driver_retry_count):
            try:
                self.firefox_driver = webdriver.Firefox(
                    firefox_binary=FIREFOX_PATH, firefox_profile=profile)
                self.firefox_driver.set_window_position(0, 0)
                self.firefox_driver.set_window_size(1024, 1200)
                logging.info("Created Firefox web driver")
                break
            except Exception as e:
                logging.error("Error creating Firefox web driver: %s", e)
                if i < self.driver_retry_count:
                    time.sleep(self.driver_retry_interval)
        if not self.firefox_driver:
            raise Exception("Could not create Firefox web driver")

    def switch_to_firefox_driver(self):
        # Lazily create the Firefox driver on first switch.
        if not self.firefox_driver:
            self.create_firefox_driver()
        self.driver = self.firefox_driver
        logging.info("Switched %s to Firefox driver: %s", self, self.driver)

    def switch_to_torbrowser_driver(self):
        # Lazily create the Tor Browser driver on first switch.
        if self.torbrowser_driver is None:
            self.create_torbrowser_driver()
        self.driver = self.torbrowser_driver
        logging.info("Switched %s to TorBrowser driver: %s", self, self.driver)

    def disable_js_torbrowser_driver(self):
        if hasattr(self, 'torbrowser_driver'):
            disable_js(self.torbrowser_driver)

    def start_source_server(self, source_port):
        # Runs in a child process (see sd_servers fixture below).
        config.SESSION_EXPIRATION_MINUTES = self.session_expiration / 60.0
        self.source_app.run(port=source_port, debug=True,
                            use_reloader=False, threaded=True)

    @pytest.fixture(autouse=True)
    def set_default_driver(self):
        """Select the default driver for each test, then quit both drivers."""
        logging.info("Creating default web driver: %s", self.default_driver_name)
        if self.default_driver_name == FIREFOX:
            self.switch_to_firefox_driver()
        else:
            self.switch_to_torbrowser_driver()

        yield

        try:
            if self.torbrowser_driver:
                self.torbrowser_driver.quit()
        except Exception as e:
            logging.error("Error stopping TorBrowser driver: %s", e)

        try:
            if self.firefox_driver:
                self.firefox_driver.quit()
        except Exception as e:
            logging.error("Error stopping Firefox driver: %s", e)

    @pytest.fixture(autouse=True)
    def sd_servers(self):
        """Start source/journalist apps in subprocesses for each test."""
        logging.info("Starting SecureDrop servers (session expiration = %s)",
                     self.session_expiration)

        # Patch the two-factor verification to avoid intermittent errors
        logging.info("Mocking models.Journalist.verify_token")
        with mock.patch("models.Journalist.verify_token", return_value=True):
            logging.info("Mocking source_app.main.get_entropy_estimate")
            with mock.patch("source_app.main.get_entropy_estimate",
                            return_value=8192):
                try:
                    # Dump a stack trace on SIGUSR1 (debugging aid).
                    signal.signal(signal.SIGUSR1,
                                  lambda _, s: traceback.print_stack(s))

                    source_port = self._unused_port()
                    journalist_port = self._unused_port()
                    self.source_location = "http://127.0.0.1:%d" % source_port
                    self.journalist_location = \
                        "http://127.0.0.1:%d" % journalist_port
                    self.source_app = source_app.create_app(config)
                    self.journalist_app = journalist_app.create_app(config)
                    self.journalist_app.config["WTF_CSRF_ENABLED"] = True

                    self.__context = self.journalist_app.app_context()
                    self.__context.push()
                    env.create_directories()
                    db.create_all()
                    self.gpg = env.init_gpg()

                    # Add our test user
                    try:
                        valid_password = "******"
                        user = Journalist(username="******",
                                          password=valid_password,
                                          is_admin=True)
                        user.otp_secret = "HFFHBDSUGYSAHGYCVSHVSYVCZGVSG"
                        db.session.add(user)
                        db.session.commit()
                    except IntegrityError:
                        logging.error("Test user already added")
                        db.session.rollback()

                    # This user is required for our tests cases to login
                    self.admin_user = {
                        "name": "journalist",
                        "password": ("correct horse battery staple"
                                     " profanity oil chewy"),
                        "secret": "HFFHBDSUGYSAHGYCVSHVSYVCZGVSG",
                    }
                    self.admin_user["totp"] = pyotp.TOTP(
                        self.admin_user["secret"])

                    def start_journalist_server(app):
                        app.run(port=journalist_port, debug=True,
                                use_reloader=False, threaded=True)

                    self.source_process = Process(
                        target=lambda: self.start_source_server(source_port))

                    self.journalist_process = Process(
                        target=lambda: start_journalist_server(self.
                                                               journalist_app))

                    self.source_process.start()
                    self.journalist_process.start()

                    # Poll until both apps answer (max ~30 * 1s attempts).
                    for tick in range(30):
                        try:
                            requests.get(self.source_location, timeout=1)
                            requests.get(self.journalist_location, timeout=1)
                        except Exception:
                            time.sleep(0.25)
                        else:
                            break
                    yield
                finally:
                    # Best-effort teardown; each step isolated so one
                    # failure doesn't skip the rest.
                    try:
                        self.source_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)
                    try:
                        self.journalist_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)
                    env.teardown()
                    self.__context.pop()

    def wait_for_source_key(self, source_name):
        """Block until the source's GPG key is available (up to 60s)."""
        filesystem_id = self.source_app.crypto_util.hash_codename(source_name)

        def key_available(filesystem_id):
            assert self.source_app.crypto_util.get_fingerprint(filesystem_id)

        self.wait_for(lambda: key_available(filesystem_id), timeout=60)

    def create_new_totp(self, secret):
        self.new_totp = pyotp.TOTP(secret)

    def wait_for(self, function_with_assertion, timeout=None):
        """Polling wait for an arbitrary assertion."""
        # Thanks to
        # http://chimera.labs.oreilly.com/books/1234000000754/ch20.html#_a_common_selenium_problem_race_conditions
        if timeout is None:
            timeout = self.timeout
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                return function_with_assertion()
            except (AssertionError, WebDriverException):
                time.sleep(self.poll_frequency)
        # one more try, which will raise any errors if they are outstanding
        return function_with_assertion()

    def safe_click_by_id(self, element_id):
        """
        Clicks the element with the given ID attribute.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable((By.ID, element_id)))
        el.location_once_scrolled_into_view
        el.click()
        return el

    def safe_click_by_css_selector(self, selector):
        """
        Clicks the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.CSS_SELECTOR, selector)))
        el.click()
        return el

    def safe_click_all_by_css_selector(self, selector, root=None):
        """
        Clicks each element that matches the given CSS selector.

        Returns:
            els (list): The list of elements that matched the selector.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        if root is None:
            root = self.driver
        els = self.wait_for(
            lambda: root.find_elements_by_css_selector(selector))
        for el in els:
            clickable_el = WebDriverWait(
                self.driver, self.timeout, self.poll_frequency).until(
                    expected_conditions.element_to_be_clickable(
                        (By.CSS_SELECTOR, selector)))
            clickable_el.click()
        return els

    def safe_send_keys_by_id(self, element_id, text):
        """
        Sends the given text to the element with the specified ID.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable((By.ID, element_id)))
        el.send_keys(text)
        return el

    def safe_send_keys_by_css_selector(self, selector, text):
        """
        Sends the given text to the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.CSS_SELECTOR, selector)))
        el.send_keys(text)
        return el

    def alert_wait(self, timeout=None):
        if timeout is None:
            timeout = self.timeout * 10
        WebDriverWait(self.driver, timeout, self.poll_frequency).until(
            expected_conditions.alert_is_present(),
            "Timed out waiting for confirmation popup.")

    def alert_accept(self):
        # adapted from https://stackoverflow.com/a/34795883/837471
        def alert_is_not_present(object):
            """ Expect an alert to not be present."""
            try:
                alert = self.driver.switch_to.alert
                alert.text
                return False
            except NoAlertPresentException:
                return True

        self.driver.switch_to.alert.accept()
        WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            alert_is_not_present,
            "Timed out waiting for confirmation popup to disappear.")
def launch(self):
    """Start a TorBrowserDriver from the stored args, yield it, and
    always quit it.

    Fix: the original called quit() only after a clean yield, leaking
    the browser process when the consumer raised; quit() now runs in a
    finally block.
    """
    self.driver = TorBrowserDriver(*self.args, **self.kwargs)
    try:
        yield self.driver
    finally:
        self.driver.quit()
if os.path.isfile(file_path): file_info = os.stat(file_path) return convert_bytes(file_info.st_size) files = {'file': open('random.jpg', 'rb')} values = {'nickname': 'anon', 'email': ''} sys.stderr = DevNull() global driver imagestring = randomString() + ".jpg" RANDOMSTRING=binascii.b2a_hex(os.urandom(32)) JPG="ffd8ffdb" with open(imagestring, "wb") as f: f.write(binascii.unhexlify(JPG + RANDOMSTRING)) try: with TorBrowserDriver(cfg.c["tor_directory"]) as driver: driver.load_url("http://REMOVED.onion/", wait_for_page_body=True) captcha = driver.find_element_by_xpath('//img[@src="Generate_Captcha.php"]') inputcaptcha = driver.find_element_by_xpath('//input[@name="captcha_code"]') inputfile = driver.find_element_by_xpath('//input[@name="file"]') sendurl = driver.find_element_by_xpath('/html/body/div/div[4]/div/form/div/input[5]') location = captcha.location size = captcha.size driver.save_screenshot("temp.jpg") x = location['x'] y = location['y'] width = location['x']+size['width'] height = location['y']+size['height'] im = Image.open('temp.jpg') im = im.convert("L") im = im.crop((int(x), int(y), int(width), int(height)))
from tbselenium.tbdriver import TorBrowserDriver
from os.path import dirname, join, realpath, getsize

# Screenshot is written next to this script.
out_img = join(dirname(realpath(__file__)), "screenshot.png")

with TorBrowserDriver(
        "/home/manivannan/pythonexamle/selenium_example/tor-browser_en-US"
) as driver:
    # Load the Tor check page and wait for its body to render.
    driver.load_url('https://check.torproject.org', wait_for_page_body=True)
    banner = "----" * 100
    print(banner)
    driver.get_screenshot_as_file(out_img)
    print(banner)
    print("Screenshot is saved as %s (%s bytes)" % (out_img, getsize(out_img)))
    # driver.get()
def get_pos_data():
    """Scrape matching product titles from several retailers via
    pcpartpicker listing pages and save them as a CSV of positive pairs.

    Flow: read links from a file, fetch each listing through Tor (new IP
    per link), follow each retailer link with a plain Chrome driver, and
    collect one row of titles per listing.
    """
    # Gets postive examples of different ram titles from different retailers
    # Left off on Kingston HyperX Fury RGB 32 GB (line 224 of ram_links.txt)
    # Left off on Intel Core i3-3240 Dual-Core Processor 3.4 Ghz (line 106 of cpu_links.txt)
    file_name = input('What file do you want to open? ')
    csv_name = input('What would you like the finished CSV to be? ')
    link_file = open('data/pcpartpicker_misc/{}.txt'.format(file_name), 'r')
    retailer_names = [
        'amazon', 'bestbuy', 'newegg', 'walmart', 'memoryc', 'bhphotovideo'
    ]
    df = pd.DataFrame(columns=retailer_names)
    links = list(link_file)
    try:
        # NOTE(review): the [51:] resume offset matches the "left off"
        # notes above — update it together with those notes.
        for link in links[51:]:
            link = link.strip()
            # Change the IP that Tor gives us
            switchIP()
            soup = None
            with TorBrowserDriver(
                    '/home/jason/.local/share/torbrowser/tbb/x86_64/tor-browser_en-US/'
            ) as driver:
                driver.get(link)
                time.sleep(15)
                soup = BeautifulSoup(driver.page_source, 'lxml')
            title_dict = dict(
                zip(retailer_names, ['' for x in range(len(retailer_names))]))
            for retailer in soup.find_all('td', attrs={'class': 'td__logo'}):
                link = 'https://www.pcpartpicker.com' + retailer.find(
                    'a')['href']
                soup = None
                for name in retailer_names:
                    if name in link:
                        if name == 'adorama':
                            # switchIP()
                            # with TorBrowserDriver('/home/jason/.local/share/torbrowser/tbb/x86_64/tor-browser_en-US/') as driver:
                            #     driver.get(link)
                            #     time.sleep(10)
                            #     soup = BeautifulSoup(driver.page_source, 'lxml')
                            # #soup = BeautifulSoup(driver.page_source, 'lxml')
                            # driver.quit()
                            pass
                        else:
                            # Retailer pages are fetched without Tor.
                            driver = webdriver.Chrome()
                            driver.get(link)
                            soup = BeautifulSoup(driver.page_source, 'lxml')
                            driver.quit()
                try:
                    if 'amazon' in link:
                        title_dict['amazon'] = soup.find(
                            'span', attrs={
                                'id': 'productTitle'
                            }).text.strip()
                        print('amazon', title_dict['amazon'])
                    elif 'bestbuy' in link:
                        title_dict['bestbuy'] = soup.find(
                            'h1', attrs={
                                'class': 'heading-5 v-fw-regular'
                            }).text.strip()
                        print('bestbuy', title_dict['bestbuy'])
                    elif 'newegg' in link:
                        title_dict['newegg'] = soup.find(
                            'h1', attrs={
                                'id': 'grpDescrip_h'
                            }).text.strip()
                        print('newegg', title_dict['newegg'])
                    elif 'walmart' in link:
                        title_dict['walmart'] = soup.find(
                            'h1', attrs={
                                'class': 'prod-ProductTitle prod-productTitle-buyBox font-bold'
                            }).text.strip()
                        print('walmart', title_dict['walmart'])
                    elif 'memoryc' in link:
                        title_dict['memoryc'] = soup.find(
                            'section', attrs={
                                'class': 'forCartImageItem'
                            }).find('h1').text.strip()
                        print('memoryc', title_dict['memoryc'])
                    elif 'bhphotovideo' in link:
                        title_dict['bhphotovideo'] = soup.find(
                            'h1', {
                                'data-selenium': 'productTitle'
                            }).text.strip()
                        print('bhphotovideo', title_dict['bhphotovideo'])
                    # elif 'adorama' in link:
                    #     title_dict['adorama'] = soup.find('div', attrs={'class': 'primary-info cf clear'}).find('h1').find('span').text.strip()
                    #     print('adorama', title_dict['adorama'])
                    else:
                        continue
                except Exception:
                    # Missing/renamed page elements are skipped silently.
                    pass
            df = df.append(
                pd.DataFrame([list(title_dict.values())],
                             columns=retailer_names))
    except (Exception, KeyboardInterrupt) as e:
        # Save whatever was collected before the interruption.
        print(str(e))
        print('here')
    df.to_csv(
        '/home/jason/Documents/Supervised-Product-Similarity/data/train/{}.csv'
        .format(csv_name))
    link_file.close()
def test_should_raise_for_invalid_tor_config(self):
    """An out-of-range tor_cfg value must raise TBDriverConfigError."""
    self.assertRaises(TBDriverConfigError,
                      TorBrowserDriver, TBB_PATH, tor_cfg=-1)
class TruliaHelper():
    """Scrape listing image/price data from trulia.com via Tor Browser
    and write the results to a CSV file."""

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath,
                                       tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        keywords = ['512 W 10th St Perris CA 92570'] * 2
        for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id(
                "homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(keyword)
            search_btn = self.driver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']")
            if search_btn:
                print("Going to click")
                search_btn.click()
                time.sleep(10)  # wait for the results page to render
                items.append(self.getItemDetail())
        self.driver.close()
        return items

    def getItemDetail(self):
        # Returns {"image": ..., "price": ...} for the current page, or
        # an empty dict when the expected markup is absent.
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            image = soup.find("div", attrs={
                "class": "Tiles__TileBackground-fk0fs3-0 cSObNX"
            }).find("img")["src"]
            price = soup.find(
                "div",
                attrs={
                    "class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
                }).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            data.update({"image": image, "price": price})
        except:
            # best-effort scrape: missing markup yields an empty dict
            pass
        return data

    # method to write csv file
    def writeCSVFile(self, data):
        try:
            with open(
                    '/home/gc14/Documents/fiverr/custom_scrapers/home/trulia.csv',
                    mode='w') as csv_file:
                fieldnames = ['Image', 'Price']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                for d in data:
                    writer.writerow({'Image': d['image'], 'Price': d['price']})
                csv_file.close()
            print("File written successfully.")
        except:
            print(sys.exc_info())
            pass

    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ", len(items))
        if items:
            self.writeCSVFile(items)
class DescargarPdf:
    """Bulk-download PDFs from a z-library onion mirror through Tor
    Browser, rotating login credentials read from a text file.

    NOTE(review): the original block was whitespace-collapsed; nesting
    below is a best-effort reconstruction — verify against the original
    file before relying on the control flow. Runtime strings (Spanish
    prints, URLs, paths) are reproduced verbatim.
    """

    def __init__(self):
        self.contadorCredenciales = 0   # credential-rotation counter
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []               # account e-mails
        self.contraseñaTxT = []         # account passwords
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        # Launch the Tor Browser driver used for all page loads.
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        # Log in: fill the e-mail and password fields and submit.
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        # Load the current search-results page and keep its HTML.
        print("estoy en la funcion paginaDescagas")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        # Build the year-filtered search URL.
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom='
                        + str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        # Append the page number to the search URL.
        print("estoy en cambiar pagina prinsipal")
        self.url += '&page=' + str(x)
        print(self.url)

    def Crearcsv(self):
        # Create the output folder (idempotent) and the two CSV writers:
        # one for book-page URLs, one for direct download URLs.
        desde = datosDescarga(1)
        asta = datosDescarga(2)
        self.carpetaUrl = ('/home/dd/Documentos/zlibrary/libros'
                           + str(desde) + '-' + str(asta) + '/url')
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dd/Documentos/zlibrary/libros' + str(desde) + '-'
                 + str(asta) + '/url/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(
            open('/home/dd/Documentos/zlibrary/libros' + str(desde) + '-'
                 + str(asta) + '/url/urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        # Select the account to use; on the first call (or after 20
        # downloads) also re-open the login page via the UI.
        print("llegue")
        if self.contadorCredenciales == 0 or self.contadorCredenciales == 20:
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
            sleep(5)
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]

    def UsuariosYcontraseñas(self):
        # Alternating lines in the credentials file: e-mail, password.
        self.dir = ('/home/dd/Documentos/zlibrary/credenciales/'
                    'contraseñasYcorreos.txt')
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self,):
        """Walk the search-results HTML, record every book URL and its
        direct download URL, download each PDF, and rotate accounts
        every 20 downloads."""
        self.contadorCredenciales = 1
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub(
                    '/book/', 'https://b-ok.cc/book/', self.urlDowload)
                self.urlDowload = re.sub(
                    '/book/', 'http://zlibraryexau2g3p.onion/book/',
                    self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        # Open the book page and extract the download link.
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf,
                                                         'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("vamos a por el if")
                        sleep(15)
                        if self.voleano == True:
                            # Known-good format: fetch with a short
                            # timeout, then relax it for the download.
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(5)
                            self.zLibraty.set_page_load_timeout(7000)
                            print("funciona PDF ")
                            self.voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            # Other formats pop a save dialog driven by
                            # keyboard automation.
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(8)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                            self.zLibraty.set_page_load_timeout(7000)
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        # Watch the download via about:downloads.
                        self.zLibraty.load_url("about:downloads")
                        self.datosEsperaDescarga()
                        self.peticiones()
                        self.zLibraty.back()
                        informaiconPdf(self.urlpdfGeleneralH)
                        guardarNumeroDescargas(self.contadorLibros)
                        self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        if self.contadorLibros == 20:
                            # Log out via the browser UI shortcuts.
                            self.contadorCredenciales = 20
                            print("saliendo de secion¡¡¡¡¡¡")
                            pyautogui.moveTo(1707, 245)
                            pyautogui.hotkey("ctrl", "shift", "u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            # Switch to the next account and log back in.
                            print("saliendo de secion")
                            self.zLibraty.get(
                                "http://zlibraryexau2g3p.onion/logout.php")
                            self.contadorUsuarios += 1
                            print(self.contadorUsuarios)
                            try:
                                self.zLibraty.switch_to_window(
                                    self.zLibraty.window_handles[0])
                            except:
                                print("error al cambian de ventana")
                            usuarioUsadosReescrivir(self.contadorUsuarios)
                            print("por aqui¿¿¿¿¿¿")
                            self.credenciales(self.contadorUsuarios)
                            self.contadorCredenciales = 1
                            print("no por aqui¿¿¿¿¿¿")
                            sleep(20)
                            self.iniciarSecion()
                            sleep(15)
                            self.paginaDescargas()
                            sleep(7)
                            self.contadorLibros2 = 0
                            sleep(15)
                        print("numero de li bros por usuario ",
                              self.contadorLibros2)
                        if self.contador == 5:
                            self.contador = 0
        except OSError as e:
            # Record progress so a rerun can resume where this stopped.
            print(e.strerror)
            print("error en la urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            archivos = int(contarNueroArchivos())
            print(archivos)
            self.zLibraty.load_url("about:downloads")
            self.datosEsperaDescarga()
            self.peticiones()
            self.zLibraty.back()
            informaiconPdf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        # Close the browser window.
        self.zLibraty.close()

    def datosEsperaDescarga(self):
        # Snapshot the about:downloads page after a short wait.
        sleep(4)
        self.htmlValidador = self.zLibraty.page_source

    def validarDescarga(self):
        # Inspect the download details; retry (click) on Canceled/Failed.
        self.htmlFalce = self.zLibraty.page_source
        self.soupFalce = BeautifulSoup(self.htmlFalce, "html.parser")
        self.validarfalce = self.soupFalce.find_all(
            "description", class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce = re.search("value=.+", str(self.validarfalce))
        self.buscarFalse = self.respuestafalce.group()
        if re.search("Canceled", self.buscarFalse):
            print("se daño al descarga =(")
            sleep(5)
            pyautogui.click(1393, 139)
            sleep(5)
        else:
            if re.search("Failed", self.buscarFalse):
                print("se daño al descarga pero vamos a solucionarlo =( ")
                sleep(5)
                pyautogui.click(1393, 139)
                sleep(5)
            else:
                print("la descarga va bien =)")

    def peticiones(self):
        # Poll the download progress bar until it reaches 100%.
        self.validarDescarga()
        self.carga = 0
        self.daño = 0
        self.conteo = 0
        while self.carga < 100:
            self.soup = BeautifulSoup(self.htmlValidador, "html.parser")
            try:
                self.archivoDescarga = self.soup.find_all(
                    "progress", class_="downloadProgress")
                self.respaldo = re.split("value", str(self.archivoDescarga))
                self.tiempo = re.search("[0-9]+", self.respaldo[1])
                print(self.tiempo.group())
                self.carga = int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo == 3:
                    pyautogui.press("enter")
                    self.conteo = 0
            except:
                print("o no ,se daño la descargar y no la e podido "
                      "volver a iniciar")
                if self.daño == 7:
                    # Give up after repeated failures and wipe partials.
                    os.system(
                        'rm -r /home/dd/zlibros/libros1920-1921/libro/*.*')
                    raise
                self.daño += 1
                sleep(5)
def iniciarTor(self):
    """Launch a Tor Browser session from ``self.tbb_dir``.

    The driver is stored on ``self.zLibraty`` and its browser log is written
    to ``test.log`` in the working directory.
    """
    self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')
def visit_using_bridge(tbb_dir, bridge_type="meek-amazon"):
    """Visit the Tor check page through a pluggable-transport bridge.

    Args:
        tbb_dir: path to the extracted Tor Browser bundle.
        bridge_type: default bridge name understood by tbselenium
            (e.g. "meek-amazon", "obfs4").
    """
    url = "https://check.torproject.org"
    with TorBrowserDriver(tbb_dir, default_bridge_type=bridge_type) as driver:
        driver.load_url(url)
        # NOTE(review): `find_element_by` is not a stock Selenium/tbselenium
        # method -- confirm this helper exists, otherwise use
        # find_element_by_css_selector("h1.on").
        print(driver.find_element_by("h1.on").text)  # status text
        sleep(10)
    # To verify that the bridge is indeed used, inspect the tor log or the
    # circuit display in the browser.
def main():
    # Smoke test: launch Tor Browser from TBB_PATH (module-level constant)
    # and load the Tor connectivity check page; the context manager shuts
    # the browser down afterwards.
    with TorBrowserDriver(TBB_PATH) as driver:
        driver.get('https://check.torproject.org')
def loggin(ema, pas):
    """Log in to Udemy through Tor Browser.

    Args:
        ema: account e-mail address.
        pas: account password.

    Returns:
        The live TorBrowserDriver on apparent success, ``None`` when the
        login landed outside udemy.com, and ``False`` when the e-mail field
        never appeared.

    Relies on module-level ``tbb_dir`` and ``cm``.
    """
    def _launch():
        # Launching can fail with WebDriverException "Access is denied"
        # while Mozilla is auto-updating the bundle; retry once after 30 s.
        # (Was a bare `except:` -- narrowed so Ctrl-C still interrupts.)
        try:
            return TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        except Exception:
            print('probably updating sleep 30')
            sleep(30)
            return TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)

    login_url = ("https://www.udemy.com/join/login-popup/?locale=en_US"
                 "&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F")

    browser = _launch()
    # connect to site
    try:
        browser.load_url(login_url, wait_on_page=5, wait_for_page_body=True)
    except Exception:
        # NoSuchWindowException: "Browsing context has been discarded" --
        # relaunch the browser once and retry the page load.
        browser = _launch()
        browser.load_url(login_url, wait_on_page=5, wait_for_page_body=True)

    # maximise
    browser.maximize_window()
    # Scroll
    browser.execute_script("window.scrollTo(0,100)")

    try:
        email_el = browser.find_element_by_id("email--1")
    except Exception:
        # Form may still be rendering; wait and retry once.
        sleep(10)
        try:
            email_el = browser.find_element_by_id("email--1")
        except Exception:
            return False
    email_el.send_keys(ema)
    # enter password
    pass_el = browser.find_element_by_id("id_password")
    pass_el.send_keys(pas)
    # find submit link
    sub_el = browser.find_element_by_id('submit-id-submit')
    # click submit
    sub_el.click()
    sleep(2)
    # check: the avatar element only exists when we are logged in
    try:
        avatar = browser.find_element_by_id('u711-popover-trigger--18')
    except Exception:
        avatar = None
    if avatar:
        return browser
    elif 'udemy.com' in browser.current_url:
        return browser
    else:
        return None
import os import random from selenium import webdriver from collections import Counter from tbselenium.tbdriver import TorBrowserDriver from selenium.webdriver.support.ui import WebDriverWait import pickle na_file = open('/home/serj/python/tor_ip', 'wb') array = [] i = 0 url = str(input('Введите адрес страницы для DDoS атаки: ')) q = int(input('Введите количество проходов: ')) time = int(input('Введите время задержки на странице: ')) while i < q: with TorBrowserDriver("/home/serj/selenium/tor-browser_en-US/") as driver: driver.get('https://2ip.ru/') ipconf = driver.find_element_by_id('d_clip_button') ip = ipconf.text realtime = str(time + random.randint(1, 15)) array.append(ip) driver.get(url) realtime = str(time + random.randint(1, 15)) driver.implicitly_wait(realtime) link1 = driver.find_elements_by_id('adContent-clickOverlay') i += 1 print('Выполнение ' + str(i / q * 100) + ' % ' + ' Задержкасна странице - ' + str(realtime) + 'сек.') driver.close() abc = Counter(array)
def up(name, ema, pas): browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM) # connect to site browser.load_url( "https://www.udemy.com/join/signup-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F", wait_on_page=5, wait_for_page_body=True) # find link button #reg_el = browser.find_element_by_link_text("Sign up") # https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F # click # reg_el.click() # enter full name full_name = browser.find_element_by_id("id_fullname") full_name.send_keys(name) # enter email email_el = browser.find_element_by_id("email--1") email_el.send_keys(ema) # enter password pass_el = browser.find_element_by_id("password") pass_el.send_keys(pas) # Scroll browser.execute_script("window.scrollBy(0,200)") browser.execute_script( 'document.getElementById("id_subscribe_to_emails").checked = false') # find submit link sub_el = browser.find_element_by_id('submit-id-submit') # click submit sub_el.click() sleep(1) # check if 'occupation' in browser.current_url: # find submit link sleep(3) try: browser.execute_script( 'document.getElementsByClassName("ot-sdk-container").sytle.display = "none"' ) except: pass cl = browser.find_elements_by_class_name("udlite-btn") try: cl[0].click() except: browser.execute_script( 'document.getElementsByClassName("ot-sdk-container").sytle.display = "none"' ) cl[0].click() sleep(3) browser.close() return True if '=1' in browser.current_url: browser.close() return True
class DescargarPdf:
    """Scrapes the z-library onion mirror: logs in with rotating credentials
    read from disk, walks search-result pages, records page/download URLs in
    CSV files and fetches each book (converted to PDF when needed).
    """

    def __init__(self):
        # Tor Browser bundle location used by iniciarTor().
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []        # account e-mails (even lines of the creds file)
        self.contraseñaTxT = []  # matching passwords (odd lines)
        self.conversor = '?convertedTo=pdf'  # query suffix forcing PDF conversion

    def iniciarTor(self):
        """Launch a fresh Tor Browser session on self.zLibraty."""
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        """Fill in and submit the login form with the current credentials."""
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        # Password is the second .form-control input on the page.
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        """Load the current search URL and keep its HTML for parsing."""
        self.zLibraty.load_url(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        """Build the search URL restricted to the given publication years."""
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom=' +
                        str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        """Append a page-number parameter to the search URL (cumulative)."""
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        """Create the output folder (if missing) and the two CSV writers:
        one for book-page URLs, one for direct download URLs."""
        print("hola")
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
            'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
            'w'))

    def credenciales(self, numeroUsuario):
        """Select the credential pair at index ``numeroUsuario`` and open the
        login page."""
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.load_url(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        """Read alternating e-mail / password lines from the credentials
        file into self.usuario and self.contraseñaTxT (100 pairs)."""
        self.dir = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/descargarLIbros/descargarparte1/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self, contador, _contadorusuarios):
        """Walk every result on the current page, record the page/download
        URLs in the CSVs and fetch each book, rotating to the next account
        every 10 downloads."""
        self.boleanoPdf = 0
        self.contadorUsuariosCon = _contadorusuarios
        self.contadorLibros2 = 0
        self.contadorLibros = 0
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        for self.urlwed in self.soup.find_all(itemprop="name"):
            self.contador = 0
            self.urlwed = self.urlwed.find('a', href=re.compile(''))
            self.urlDowload = self.urlwed.get('href')
            self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                           self.urlDowload)
            self.urlDowload = re.sub('/book/',
                                     'http://zlibraryexau2g3p.onion/book/',
                                     self.urlDowload)
            self.escrivirUrlWed.writerow([self.urlDowload])
            print(self.urlDowload)
            self.contadorLibros += 1
            self.contadorLibros2 += 1
            if self.contadorLibros2 == 10:
                # Per-account download cap reached: restart the browser and
                # log in with the next credential pair.
                self.contador += 1
                self.serrarTor()
                sleep(4)
                self.iniciarTor()
                self.contadorUsuariosCon += 1
                print(self.contadorUsuariosCon)
                # BUG FIX: the original called self.credenciales with the
                # undefined name `contadorusuarios` (NameError); the rotated
                # index lives in self.contadorUsuariosCon.
                self.credenciales(self.contadorUsuariosCon)
                self.iniciarSecion()
                sleep(7)
                self.contadorLibros2 = 0
                sleep(15)
            if self.contador == 5:
                self.contador = 0
            voleano = validarFormato(self.urlpdfGeleneralH)
            for self.urlRedirec in range(0, 1):
                self.zLibraty.load_url(self.urlDowload)
                sleep(5)
                self.htmlPdf = self.zLibraty.page_source
                self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                self.urlDowloadPDF = self.soupRedirec.find(
                    class_="btn btn-primary dlButton addDownloadedBook")
                self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                self.urlDowloadPDF = re.sub(
                    '/dl/', 'http://zlibraryexau2g3p.onion/book/dl/',
                    self.urlDowloadPDF)
                self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                print(self.urlDowloadPDF)
                if voleano == True:
                    # Already a PDF: download it directly.
                    self.zLibraty.get(self.urlDowloadPDF)
                    voleano = False
                else:
                    # Ask the site to convert the book to PDF first.
                    self.convertirpdf = (str(self.urlDowloadPDF) +
                                         str(self.conversor))
                    self.zLibraty.get(self.convertirpdf)
                sleep(50)
                informaiconPDf(self.urlDowload)

    def DescargarContenido(self, _html):
        """Store raw HTML handed in by the caller."""
        self.contenido = _html

    def serrarTor(self):
        """Close the Tor Browser session."""
        self.zLibraty.close()
class Visit(object):
    """Hold info about a particular visit to a page.

    Owns the whole lifecycle of one crawl visit: result directories, an
    optional Xvfb virtual display, a TorBrowserDriver instance and a packet
    sniffer whose pcap is post-filtered down to guard-node traffic.
    """

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir,
                 tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        # Identifiers used to name the result files (see get_instance_name).
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site          # background tab URL (multitab mode)
        self.experiment = experiment    # selects get_wang_and_goldberg/get_multitab
        self.base_dir = base_dir
        self.visit_dir = None           # set by init_visit_dir()
        self.visit_log_dir = None       # set by init_visit_dir()
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))
        # Headless operation: start a virtual display unless running in CI.
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()
        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        """Rewrite the visit's pcap so it keeps only packets to/from the Tor
        guard nodes (plus non-IP packets); the original capture is kept as
        ``*.original`` until filtering succeeds."""
        guard_ips = set([ip for ip in self.tor_controller.get_all_guard_ips()])
        wl_log.debug("Found %s guards in the concensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    # Keep non-IP packets (e.g. ARP) as-is.
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            # On failure the unfiltered .original file is left for inspection.
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        """Hook executed after cleanup; currently a no-op."""
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()
        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()
        # remove non-tor traffic
        self.filter_guards_from_pcap()
        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()
        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()
        # after closing driver and stopping sniffer, we run postcrawl
        self.post_crawl()

    def take_screenshot(self):
        """Best-effort screenshot of the loaded page into the visit dir.

        NOTE(review): the bare ``except`` swallows every error (including
        KeyboardInterrupt) -- consider narrowing to ``Exception``.
        """
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        # Capture everything except SSH (22) and FTP-data (20) to localhost.
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            # NOTE(review): bare except -- a failed timeout setup is only logged.
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)
        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)
        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
def makeRequest(url, domain):
    """Load ``url`` in a real Tor Browser and append an entry to the log file.

    A real browser (rather than `requests` over the tor SOCKS proxy) is used
    so that JavaScript executes and Google Analytics counts the visit as a
    real user.

    Args:
        url: fully-qualified URL to visit.
        domain: human-readable domain name recorded in the log entry.

    Relies on module-level ``logfile_name`` and the ``checkConn()`` helper.
    """
    import time  # function-scoped import, kept from the original

    print('Changing IP...\n')

    # (A previous "Method A" used the requests library through the tor SOCKS
    # proxy with tr.reset_identity(); it was abandoned because it does not
    # execute JS, so analytics never saw the visit.)
    driver = TorBrowserDriver("/home/manos/Desktop/tor-browser_en-US")
    # driver.get('https://ipecho.net/plain')
    # new_ip = driver.find_element_by_tag_name('body').text
    checkConn()
    driver.get(url)
    time.sleep(2.0)
    driver.close()

    # Request logging. Renamed from `time` to avoid shadowing the time module.
    timestamp = ('Date: ' + str(datetime.datetime.now())[0:10] +
                 '\nTime: ' + str(datetime.datetime.now())[11:19])
    # Open the log only now (the original opened it before browsing and
    # leaked the handle if the browser raised); `with` guarantees the close.
    with open(logfile_name, 'a') as f:
        f.write(
            timestamp + '\nDomain: ' + domain + '\n'
            'Request sent to ' + url + '.' +
            '\nResponse status code: ' + str(200) +
            '\n*******************************************************************************************\n\n'
        )
    # NOTE(review): the status code is hard-coded to 200 -- the browser API
    # exposes no HTTP status, so this log field is not a real measurement.
    os.system('clear')
from tbselenium.tbdriver import TorBrowserDriver import proxy import random ### Driver setup driver = TorBrowserDriver("/home/octavcosmin/Files/tor-browser_en-US") # while 1: # driver.load_url('https://check.torproject.org', wait_for_page_body=True) # time.sleep(3) # proxy.fresh_identity() # driver.refresh() def vote(): driver.load_url('https://docs.google.com/forms/d/e/1FAIpQLSfw-iWlyH0Zk2qKnKo-nLD3tdMxS_-s2hk6RqEEf7Hs-7BWqg/viewform', wait_for_page_body=True) check_boxes = driver.find_elements_by_class_name("exportInnerBox") ## Genul (0-1) # 0 - Masculin # 1 - Feminin ## Clasa (2-9) # 2 - a V-a # 3 - a VI-a # 4 - a VII-a # 5 - a VIII-a # 6 - a IX-a # 7 - a X-a # 8 - a XI-a # 9 - a XII-a ## Dificultati # 10 - Am o afecțiune cronică ce creşte riscul de complicaţii în cazul infecţiei cu COVID-19 # 11 - Locuiesc cu persoane în vârstă/ cu afecţiuni cronice, vulnerabile la infecţia cu COVID-19
if setting == "PREPARE_URLS":
    # Determine how many pages are loaded before restarting the browser
    if runidentifier.startswith('wsc'):
        listsize_urls = 1
    else:
        listsize_urls = 5
    # Prepare files with subsets of URLs
    prepare_url_files(listsize_urls, urlfile)
else:
    # Validate the timeout argument (isdigit() only matches non-negative
    # integers, so the "else" branch can only trigger for 0).
    if tmp1.isdigit():
        if int(tmp1) > 0:
            timeout = int(tmp1)
        else:
            exit_with_help('Error: Timeout is a Negative Number!')
    else:
        # fixed duplicated word in the original message ("is is")
        exit_with_help('Error: Timeout is not a Number!')
    if hostname is None:
        # fixed duplicated word in the original message ("is is")
        exit_with_help('Error: Hostname is not defined!')
    # Read URLs
    urls = parse_url_list(urlfile)
    urlfilename = urlfile.split('/')[-1]
    # Crawl the whole URL list in a single Tor Browser session.
    with TorBrowserDriver(tbb_dir) as driver:
        driver.load_url_improved(urls, runidentifier, hostname, urlfile,
                                 timeout)
def test_should_raise_for_missing_tbb_path(self):
    """Constructing the driver with a non-existent TBB path must raise
    TBDriverPathError with the exact diagnostic message."""
    with self.assertRaises(TBDriverPathError) as exc:
        TorBrowserDriver(tbb_path=MISSING_DIR)
    self.assertEqual(str(exc.exception),
                     "TBB path is not a directory %s" % MISSING_DIR)
def __init__(self): self.url = 'https://www.trulia.com' # need to set Tor Browser path here. tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US" self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')
def test_should_fail_launching_tbb_tor_on_custom_socks_port(self):
    """Launching the bundled tor while pointing the driver at a random free
    SOCKS port (one tor is not configured for) must raise
    TBDriverPortError."""
    with self.assertRaises(TBDriverPortError):
        TorBrowserDriver(TBB_PATH, socks_port=free_port(),
                         tor_cfg=cm.LAUNCH_NEW_TBB_TOR)
# PyPi: https://pypi.org/project/tbselenium/ # Github: https://github.com/webfp/tor-browser-selenium # pip install tbselenium # Install geckodriver from the geckodriver releases page. Make sure you install version v0.23.0 version or newer; # older versions may not be compatible with the current Tor Browser series. # Basic usage: # Using with system tor # tor needs to be installed (apt install tor) and running on port 9050. from tbselenium.tbdriver import TorBrowserDriver with TorBrowserDriver("/path/to/TorBrowserBundle/") as driver: driver.get('https://check.torproject.org') # Using with Stem # First, make sure you have Stem installed (pip install stem). The following will start a new tor process using Stem. # It will not use the tor installed on your system. import tbselenium.common as cm from tbselenium.tbdriver import TorBrowserDriver from tbselenium.utils import launch_tbb_tor_with_stem tbb_dir = "/path/to/TorBrowserBundle/" tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir) with TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM) as driver: driver.load_url("https://check.torproject.org")
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""
    # NOTE(review): several defaults below are mutable ({} dicts); since
    # __init__ mutates torrc_config in place, the default dict is shared
    # across instances -- consider None-sentinels.
    def __init__(self, take_ownership=True, # Tor dies when the Crawler does
                 torrc_config={"CookieAuth": "1"},
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt","tbb","tor-browser_en-US"),
                 tb_log_path=join(_log_dir,"firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields={},
                 db_handler=None):
        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)
        self.torrc_config = torrc_config
        # Pick free ports (falling back from the requested ones) and write
        # them into the torrc before launching tor.
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                                  take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()
        self.logger.info("Opening cell log stream...")
        # Binary stream over tor's cell sequence log; position arithmetic in
        # get_cell_log_pos/get_full_trace depends on byte offsets.
        self.cell_log = open(tor_cell_log, "rb")
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()
        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception
        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        """Connect a stem Controller to our control port and authenticate
        (cookie auth); panics on connection/authentication failure."""
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
            wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        # NOTE(review): these lookups go over the clearnet, not through tor.
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(re.search("[0-9A-F]{40}", g).group(0)
                                          for g in entry_nodes
                                          if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # NOTE(review): returning a truthy value (self) from __exit__
        # suppresses any in-flight exception -- almost certainly should
        # return None/False instead.
        self.close()
        return self

    def __del__(self):
        # close() is idempotent-ish via the dir(self) guards below, so a
        # second call from the destructor after an explicit close is safe.
        self.close()

    def close(self):
        """Tear down browser, framebuffer, cell-log stream and tor process.
        Guards with `"attr" in dir(self)` so a partially-constructed Crawler
        can still be closed."""
        self.logger.info("Beginning Crawler exit process...")
        if "tb_driver" in dir(self):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if "virtual_framebuffer" in dir(self):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when
            # closing a virtual framebuffer if the $DISPLAY environment
            # variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if "cell_log" in dir(self):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if "tor_process" in dir(self):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self, url, hsid=None, extra_fn=None,
                            trace_dir=None, iteration=0):
        """Crawl an onion service and collect a complete cell sequence for
        the activity at the time. Also, record additional information about
        the circuits with stem. Optionally, pass a function to execute
        additional actions after the page has loaded.

        Returns "succeeded" or "failed"."""
        # Todo: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")
        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)
        sleep(self.wait_after_closing_circuits)
        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)
        start_idx = self.get_cell_log_pos()
        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(url, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except:
            # NOTE(review): bare except -- catches everything, including
            # KeyboardInterrupt; consider `except Exception`.
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()
            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()
            return "failed"
        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)
        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {'hsid': hsid,
                               'crawlid': self.crawlid,
                               't_scrape': get_timestamp("db")}
            except NameError:
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path+"-full", "wb") as fh:
                fh.write(full_trace)
        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)
        # Persist the control metadata alongside the traces.
        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)
        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell
        log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function
        to execute custom actions on it.

        Raises CrawlerLoggedError on timeout, bad connection state, or when
        an error page / about:newtab is reached."""
        self.logger.info("{url}: starting page load...".format(**locals()))
        try:
            self.tb_driver.load_url(url, wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError
        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError
        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given onion
        service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()
        # Match circuits to this URL via the SOCKS username tbselenium sets.
        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and
                circ.socks_username and
                circ.socks_username in url):
                rend_circ_ids.add(circ.id)
        # If everything goes perfect, we should only see one. Multiple
        # indicate the first failed. Zero indicates one closed abruptly (or
        # there's an error with stem--still waiting on data to confirm or
        # deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info("{url}: {rend_circ_ct} associated rendezvous circuits "
                         "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError
        return rend_circ_ids

    def execute_extra_fn(self, url, trace_path, start_idx):
        """Run the module-level ``extra_fn`` hook against this crawler.

        NOTE(review): this calls the global name ``extra_fn`` rather than the
        ``extra_fn`` argument passed through collect_onion_trace -- confirm
        intended."""
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        """Dump the cell-log section from start_idx to now for debugging."""
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")
        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser.

        NOTE(review): references module-level ``tbb_path``/``tb_log_path``,
        not the values given to __init__ (which are not stored on self) --
        likely raises NameError unless such globals exist."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self, url_set, extra_fn=None, trace_dir=None,
                              iteration=0, shuffle=True, retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces; failed URLs are retried once."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()
        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))
        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)
        failed_urls = []
        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx+1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None
            if (self.collect_onion_trace(url, hsid=hsid, extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                and retry):
                failed_urls.append(url)
        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            # Recurse once with retry=False so failures aren't retried forever.
            self.collect_set_of_traces(failed_urls, extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle, retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self, monitored_class,
                                     nonmonitored_class,
                                     extra_fn=None, shuffle=True, retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None, ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                # NOTE(review): mutates the caller's nonmonitored_class dict
                # in place via update().
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None,) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)
        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)
        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)
        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)
        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration+1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size,
                           nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import launch_tbb_tor_with_stem

# Path to the extracted Tor Browser bundle.
tbb_dir = "/home/andrew/Desktop/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US/"
# Start a dedicated tor process via Stem (does not use the system tor).
tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
with TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM) as driver:
    driver.load_url("https://check.torproject.org")

print("Done")
# The Stem-launched tor is not owned by the driver; kill it explicitly.
tor_process.kill()

# Alternative: bundled-tor mode without Stem.
# from tbselenium.tbdriver import TorBrowserDriver
# with TorBrowserDriver("/home/andrew/Desktop/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US/") as driver:
#     driver.get('https://check.torproject.org')