Example 1
import os

# Imports assumed for a standalone run; _add_geckodriver_to_path is a
# project-local helper defined elsewhere in the original module.
from selenium.common.exceptions import WebDriverException
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.exceptions import TBDriverPortError


def create_tor_webdriver() -> TorBrowserDriver:
    """
    Create a Selenium Tor Browser driver and load the Tor check page.
    :return: TorBrowserDriver
    """
    _add_geckodriver_to_path()

    try:
        cur_path = os.path.abspath(os.curdir)
        driver = TorBrowserDriver('tor_browser_folder')
        driver.load_url('https://check.torproject.org',
                        wait_for_page_body=True)
        os.chdir(cur_path)
        return driver
    except TBDriverPortError:
        print(
            'You probably need to install the Tor service.\n'
            'On Linux, try:\nsudo apt-get install tor\n'
        )
        raise
    except WebDriverException:
        print(
            'You probably have an incompatible geckodriver;\n'
            'if so, visit:\nhttps://github.com/mozilla/geckodriver/releases\n'
            'Or geckodriver may simply be missing from your PATH; add it there. :)\n'
        )
        raise
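A minimal usage sketch for the helper above (assuming tbselenium and the Tor
Browser bundle are set up as the function expects):

    driver = create_tor_webdriver()
    try:
        print(driver.current_url)  # should be the check.torproject.org page
    finally:
        driver.quit()  # always release the browser and its Tor process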
Example 2
class crearCorros:
    def __init__(self):
        self.urlProtocoe = ('http://3g2upl4pq6kufc4m.onion',
                            'https://mail.protonmail.com/create/new',
                            'https://singlelogin.org/registration.php')
        print(self.urlProtocoe[2])
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.protocoe = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')
        self.dirNombre = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/nombre.txt'
        self.nombre = open(self.dirNombre, 'r+')
        self.dirapellido = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/apellidos.txt'
        self.apellido = open(self.dirapellido, 'r+')
        self.dirContrasenna = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/contraseña.txt'
        self.contrasenna = open(self.dirContrasenna, 'r+')
        self.dirCotrasenna2 = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/contraseña2.txt'
        self.Contrasenna2 = open(self.dirCotrasenna2, 'r+')
        self.datosContrasenna = []
        self.lista = []
        for self.d in range(0, 101):
            self.lista.append(self.nombre.readline() + 'asdsdf')
            self.datosContrasenna.append(self.contrasenna.readline() + "blabal")
        for self.d in range(0, 100):
            # Strip newlines, then remove accents: NFD-decompose, drop
            # combining marks (preserving ñ), and NFC-recompose.
            self.lista[self.d] = re.sub('\n', 'asdaawderca', self.lista[self.d])
            self.datosContrasenna[self.d] = re.sub('\n', 'radabanals',
                                                   self.datosContrasenna[self.d])
            self.lista[self.d] = re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                r"\1", normalize("NFD", self.lista[self.d]), 0, re.I)
            self.lista[self.d] = normalize('NFC', self.lista[self.d])
            self.datosContrasenna[self.d] = re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                r"\1", normalize("NFD", self.datosContrasenna[self.d]), 0, re.I)
            self.datosContrasenna[self.d] = normalize('NFC',
                                                      self.datosContrasenna[self.d])
            self.lista[self.d] += '@maildrop.cc'
    def iniciarTor(self):
        self.protocoe.load_url(self.urlProtocoe[2])
    def ingresarDatos(self,fila):
        self.eamil=self.protocoe.find_element_by_name('email')
        self.eamil.click()
        sleep(random.uniform(1.0,4))
        self.eamil.send_keys(self.lista[fila])
        self.pasword=self.protocoe.find_element_by_name("password")
        self.pasword.click()
        sleep(random.uniform(1.0,5))
        self.pasword.send_keys(self.datosContrasenna[fila])
        self.name=self.protocoe.find_element_by_name("name")
        sleep(random.uniform(1.0,6))
        self.name.click()
        self.name.send_keys(self.datosContrasenna[fila])
        sleep(random.uniform(2.0,8.7))
        self.name.send_keys(Keys.RETURN)
    def serrarTor(self):
        self.protocoe.close() 
    def imprimirDatos(self):
        #self.dirscv='/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/emailFalsos/contraseñasYcorreos.csv'
        #self.datos=csv.writer(open(self.dirscv,'w'))
        for d in range(0,100):
            #self.datos.writerow([self.lista[d]])
            #self.datos.writerow([self.datosContrasenna[d]])
            print(self.lista[d])
            print(self.datosContrasenna[d])
Example 3
class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        self.zLibraty.load_url(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = 'https://b-ok.cc/s/?yearFrom=' + str(
            añoInicial) + '&yearTo=' + str(añoFinal)
        self.url = self.urlAños

    def Crearcsv(self):
        print("hola")
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
                 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(
                '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
                'w'))

    def credenciales(self):
        self.correo = '*****@*****.**'
        self.contraseña = 'ddggcc77'
        self.urlLoguin = 'https://singlelogin.org/?logoutAll'
        self.zLibraty.load_url(self.urlLoguin)

    def urlPdf(self):
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        for self.urlwed in self.soup.find_all(itemprop="name"):
            self.urlwed = self.urlwed.find('a', href=re.compile(''))
            self.urlDowload = self.urlwed.get('href')
            self.urlDowload = re.sub('/book/', 'https://b-ok.cc/book/',
                                     self.urlDowload)
            self.escrivirUrlWed.writerow([self.urlDowload])
            print(self.urlDowload)
Example 4
def up(name, ema, pas):

    browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    # connect to site
    browser.load_url(
        "https://www.udemy.com/join/signup-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
        wait_on_page=5,
        wait_for_page_body=True)
    # find link button
    #reg_el = browser.find_element_by_link_text("Sign up")
    # https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F
    # click
    # reg_el.click()
    # enter full name
    full_name = browser.find_element_by_id("id_fullname")
    full_name.send_keys(name)
    # enter email
    email_el = browser.find_element_by_id("email--1")
    email_el.send_keys(ema)
    # enter password
    pass_el = browser.find_element_by_id("password")
    pass_el.send_keys(pas)
    # Scroll
    browser.execute_script("window.scrollBy(0,200)")
    browser.execute_script(
        'document.getElementById("id_subscribe_to_emails").checked = false')

    # find submit link
    sub_el = browser.find_element_by_id('submit-id-submit')
    # click submit
    sub_el.click()
    sleep(1)
    # check
    if 'occupation' in browser.current_url:
        # find submit link
        sleep(3)
        try:
            browser.execute_script(
                'document.getElementsByClassName("ot-sdk-container")[0].style.display = "none"'
            )
        except Exception:
            pass
        cl = browser.find_elements_by_class_name("udlite-btn")
        try:
            cl[0].click()
        except Exception:
            browser.execute_script(
                'document.getElementsByClassName("ot-sdk-container")[0].style.display = "none"'
            )
            cl[0].click()

        sleep(3)
        browser.close()
        return True
    if '=1' in browser.current_url:
        browser.close()
        return True
Example 5
class TestSite(unittest.TestCase):
    def setUp(self):
        # Point the path to the tor-browser_en-US directory in your system
        tbpath = '/home/kdas/.local/tbb/tor-browser_en-US/'
        self.driver = TorBrowserDriver(tbpath, tbb_logfile_path='test.log')
        self.url = "https://check.torproject.org"

    def tearDown(self):
        # We want the browser to close at the end of each test.
        self.driver.close()

    def test_available(self):
        self.driver.load_url(self.url)
        # Find the element for success
        element = self.driver.find_element_by_class_name('on')
        self.assertEqual(
            str.strip(element.text),
            "Congratulations. This browser is configured to use Tor.")
        sleep(2)  # So that we can see the page
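To run the test above as a standalone script, the usual unittest entry point
can be appended (a sketch; the snippet also assumes unittest, sleep, and
TorBrowserDriver are imported at the top of the file):

    if __name__ == '__main__':
        unittest.main()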
Example 6
class validarCuentas:
    def __init__(self):
        self.dirNombre = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/nombre.txt'
        self.nombre = open(self.dirNombre, 'r+')
        self.email = []
        for self.d in range(0, 101):
            self.email.append(self.nombre.readline() + 'asdsdf')
        for self.d in range(0, 100):
            self.email[self.d] = re.sub('\n', 'asdaawderca',
                                        self.email[self.d])
            self.email[self.d] = re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                r"\1", normalize("NFD", self.email[self.d]), 0, re.I)
            self.email[self.d] = normalize('NFC', self.email[self.d])

    def iniciarTor(self, fila):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.mailpro = TorBrowserDriver(self.tbb_dir,
                                        tbb_logfile_path='test.log')
        self.mailpro.load_url('https://maildrop.cc/')

    def ingresarDatos(self, fila):
        self.pulsar = self.mailpro.find_elements_by_xpath('//input')[1]
        self.pulsar.send_keys(self.email[fila])
        self.pulsar.send_keys(Keys.RETURN)
        sleep(6)
        self.correo = self.mailpro.find_elements_by_xpath('//div[@class]')[14]
        self.correo.click()
        sleep(5)
        self.iframe = self.mailpro.find_element_by_tag_name('iframe')
        self.mailpro.switch_to.frame(self.iframe)
        print(self.mailpro.page_source)
        self.mailpro.find_elements_by_xpath('//a[@href]')[1].click()

    def serrarTor(self):
        self.mailpro.close()

    def imprimirDatos(self):
        for d in range(0, 100):
            print(self.email[d])
Example 7
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""
    def __init__(
            self,
            take_ownership=True,  # Tor dies when the Crawler does
            torrc_config={"CookieAuth": "1"},
            tor_log="/var/log/tor/tor.log",
            tor_cell_log="/var/log/tor/tor_cell_seq.log",
            control_port=9051,
            socks_port=9050,
            run_in_xvfb=True,
            tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
            tb_log_path=join(_log_dir, "firefox.log"),
            tb_tor_cfg=USE_RUNNING_TOR,
            page_load_timeout=20,
            wait_on_page=5,
            wait_after_closing_circuits=0,
            restart_on_sketchy_exception=True,
            additional_control_fields={},
            db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config, take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
                wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0) for g in entry_nodes
            if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Return a falsy value so exceptions raised inside the `with`
        # block propagate instead of being silently suppressed.
        return False

    def __del__(self):
        self.close()

    def close(self):
        self.logger.info("Beginning Crawler exit process...")
        if "tb_driver" in dir(self):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if "virtual_framebuffer" in dir(self):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when closing a
            # virtual framebuffer if the $DISPLAY environment variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if "cell_log" in dir(self):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if "tor_process" in dir(self):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self,
                            url,
                            hsid=None,
                            extra_fn=None,
                            trace_dir=None,
                            iteration=0):
        """Crawl an onion service and collect a complete cell sequence for the
        activity at the time. Also, record additional information about the
        circuits with stem. Optionally, pass a function to execute additional
        actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")

        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)

        sleep(self.wait_after_closing_circuits)

        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)

        start_idx = self.get_cell_log_pos()

        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(extra_fn, url, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except Exception:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()

            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()

            return "failed"

        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)

        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {
                    'hsid': hsid,
                    'crawlid': self.crawlid,
                    't_scrape': get_timestamp("db")
                }
            except NameError:
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)

        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)

        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)

        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function
        to execute custom actions on it."""

        self.logger.info("{url}: starting page load...".format(**locals()))

        try:
            self.tb_driver.load_url(url,
                                    wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError

        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError

        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given onion
        service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()

        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)

        # If everything goes perfect, we should only see one. Multiple indicate
        # the first failed. Zero indicates one closed abruptly (or there's an
        # error with stem--still waiting on data to confirm or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info(
            "{url}: {rend_circ_ct} associated rendezvous circuits "
            "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError

        return rend_circ_ids

    def execute_extra_fn(self, extra_fn, url, trace_path, start_idx):
        """Run a caller-supplied function after the page has loaded."""
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")

        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        self.tb_driver = TorBrowserDriver(tbb_path=self.tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=self.tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self,
                              url_set,
                              extra_fn=None,
                              trace_dir=None,
                              iteration=0,
                              shuffle=True,
                              retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()

        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))

        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)

        failed_urls = []

        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None

            if (self.collect_onion_trace(url,
                                         hsid=hsid,
                                         extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)

        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            self.collect_set_of_traces(failed_urls,
                                       extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle,
                                       retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self,
                                     monitored_class,
                                     nonmonitored_class,
                                     extra_fn=None,
                                     shuffle=True,
                                     retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None,
                                     ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None, ) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)

        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)

        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)

        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)

        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)

            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size, nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
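Since the Crawler defines __enter__ and __exit__, it can be driven as a
context manager so that close() always runs. A minimal sketch based only on
the methods above; the onion URL is a hypothetical placeholder:

    with Crawler(page_load_timeout=30) as crawler:
        # Collects one trace per URL into a timestamped batch directory
        # (or into the database when a db_handler is configured).
        crawler.collect_set_of_traces({"http://exampleonionaddress.onion"})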
Example 8
#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver
import pickle

tbpath = "tor-browser_en-US"

with open('onions.sav', 'rb') as f:
    potential_onions = pickle.load(f)

print("Loaded {} onions".format(len(potential_onions)))

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(60)

good_onions = []

for onion in potential_onions:
    try:
        driver.load_url(onion)
        good_onions.append(onion)
    except Exception as e:
        print(e)

print("Good onions")
for onion in good_onions:
    print(onion)

with open('good-onions.sav', 'wb') as f:
    pickle.dump(good_onions, f)

# Shut down Tor Browser now that the scan is complete.
driver.quit()
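The script expects onions.sav to already exist. A minimal sketch of how such
an input file could be produced (the URL is a hypothetical placeholder):

    import pickle

    potential_onions = ["http://exampleonionaddress.onion"]
    with open('onions.sav', 'wb') as f:
        pickle.dump(potential_onions, f)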
Example 10
class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.zLibraty.refresh()
        sleep(10)
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        self.zLibraty.load_url(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = 'http://zlibraryexau2g3p.onion/s/?yearFrom=' + str(
            añoInicial) + '&yearTo=' + str(añoFinal)
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        print("hola")
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
                 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(
                '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
                'w'))

    def credenciales(self, numeroUsuario):
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.load_url(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        self.dir = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/descargarLIbros/descargarparte1/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self, contador, _contadorusuarios):
        self.boleanoPdf = 0
        self.contadorUsuariosCon = _contadorusuarios
        self.contadorLibros2 = 0
        self.contadorLibros = 0
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        for self.urlwed in self.soup.find_all(itemprop="name"):
            self.contador = 0
            self.urlwed = self.urlwed.find('a', href=re.compile(''))
            self.urlDowload = self.urlwed.get('href')
            self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                           self.urlDowload)
            self.urlDowload = re.sub('/book/',
                                     'http://zlibraryexau2g3p.onion/book/',
                                     self.urlDowload)
            self.escrivirUrlWed.writerow([self.urlDowload])
            print(self.urlDowload)
            self.contadorLibros += 1
            self.contadorLibros2 += 1
            if self.contadorLibros2 == 10:
                self.contador += 1
                self.serrarTor()
                sleep(4)
                self.iniciarTor()
                self.contadorUsuariosCon += 1
                print(self.contadorUsuariosCon)
                self.credenciales(self.contadorUsuariosCon)
                self.iniciarSecion()
                sleep(7)
                self.contadorLibros2 = 0
                sleep(15)
                if self.contador == 5:
                    self.contador = 0
            voleano = validarFormato(self.urlpdfGeleneralH)
            for self.urlRedirec in range(0, 1):
                self.zLibraty.load_url(self.urlDowload)
                sleep(5)
                self.htmlPdf = self.zLibraty.page_source
                self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                self.urlDowloadPDF = self.soupRedirec.find(
                    class_="btn btn-primary dlButton addDownloadedBook")
                self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                self.urlDowloadPDF = re.sub(
                    '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                    self.urlDowloadPDF)
                self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                print(self.urlDowloadPDF)
                if voleano:
                    self.zLibraty.get(self.urlDowloadPDF)
                    voleano = False
                else:
                    self.convertirpdf = str(self.urlDowloadPDF) + str(
                        self.conversor)
                    self.zLibraty.get(self.convertirpdf)
                sleep(20)
                tiempoDescarga()
                informaiconPDf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
Example 11
#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver
import time

from sys import argv, exit

tbpath = "tor-browser_en-US"

if len(argv) == 1:
    website = "about:blank"
else:
    website = argv[1]

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(90)
try:
    driver.load_url(website)
except Exception as e:
    print(e)
    driver.quit()
    exit(1)

time.sleep(1)

driver.quit()
exit(0)
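Onion services time out often, so a natural extension of this script is to retry load_url a few times before giving up. A minimal sketch under the same tbselenium assumptions:

#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver

TBPATH = "tor-browser_en-US"


def load_with_retries(driver, url, attempts=3):
    # Tor circuits are slow and flaky; retry the load a few times.
    for attempt in range(attempts):
        try:
            driver.load_url(url)
            return True
        except Exception as e:
            print("attempt", attempt + 1, "failed:", e)
    return False


if __name__ == "__main__":
    driver = TorBrowserDriver(TBPATH)
    driver.set_page_load_timeout(90)
    ok = load_with_retries(driver, "https://check.torproject.org")
    driver.quit()
    raise SystemExit(0 if ok else 1)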
Esempio n. 12
0
#!/usr/bin/env python3
# The imports and driver setup below are assumed from context,
# since the original snippet starts mid-script.
from time import sleep

from tbselenium.tbdriver import TorBrowserDriver

driver = TorBrowserDriver("tor-browser_en-US")

# driver.add_cookie({'name': 'APISID', 'value': '0Hux_fPcVPDg5XZv/AGRW-QnPZsMClW2UL'})
# driver.add_cookie({'name': 'SAPISID', 'value': 'GzIWigexS_D7_NR0/AE6bq0GbXVCl9ErBX'})
# driver.add_cookie({'name': 'CONSENT', 'value': 'YES+US.en+20161213-01-0'})
# driver.add_cookie({'name': 'NID', 'value': '181=wpliWgjEbH5Fdsn8rZxR14StlzVdjRdAXf2p4bOXrkrgMI2Jl-TiSwmy0SVpbNXRVXTFsn5hXi8dThbwouwWYKTJK5Ih_y1olVCvWoiATKJKe_5AghBKGxiCBlVVwoXmcVCa2tk4BGuiF4DCrJ6wZI0EQAOl9OHTu_VsRVuNAXW03NtCEStaTqKXhQnC2Hh0sNlB4_IedlEbW35i'})
# driver.add_cookie({'name': '1P_JAR', 'value': '2019-4-23-3'})
# driver.add_cookie({'name': 'DV', 'value': 'o1Nbl6B8jsZRELXrd6iumDQIQaWCpNYGGjFTFCYbPQAAAOB85pD-5LPtOQAAAOwZ_awmB2dqGAAAAAKrLPZJ7nerCgAAAA'})
# driver.add_cookie({'name': 'SIDCC', 'value': 'AN0-TYsIc9ao8HSl8ErVpCFrEf0JYQbOHu-ttenPp8mfKx-rY-Z6GPOJqgW0snerz0czS1As5w'})

# url = "https://check.torproject.org"
url = "https://www.google.com/search?q=playoffs"

# Note: Selenium cannot attach per-request headers, so this dict is never used.
headers = {}
headers["User-agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"


driver.load_url(url)
# driver.context()

# Find a result heading to confirm the search page rendered
# ('LC20lb' was the class Google used for result titles at the time).
element = driver.find_element_by_class_name('LC20lb')
print(element.text)
sleep(2)  # So that we can see the page

driver.close()


# class TestSite(unittest.TestCase):
#     def setUp(self):
#         # Point the path to the tor-browser_en-US directory in your system
#         tbpath = '/home/andrew/Desktop/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US/'
#         self.driver = TorBrowserDriver(tbpath, tbb_logfile_path='test.log', tor_cfg=cm.USE_STEM)
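One caveat with the commented-out add_cookie() calls above: Selenium only accepts a cookie for the domain of the page currently loaded, so the order has to be load the domain first, then add cookies, then navigate to the target URL. A short sketch (cookie value reused from the commented block):

# Visit the domain first, then attach cookies, then reload the real target.
driver.load_url("https://www.google.com")
driver.add_cookie({'name': 'CONSENT', 'value': 'YES+US.en+20161213-01-0'})
driver.load_url("https://www.google.com/search?q=playoffs")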
Esempio n. 13
0
# Required imports for this fragment (assumed from context):
import random
import re
from time import sleep
from unicodedata import normalize

from selenium.webdriver.common.keys import Keys
from tbselenium.tbdriver import TorBrowserDriver


class crearCorros:
    def __init__(self):
        self.urlProtocoe = 'http://3g2upl4pq6kufc4m.onion', 'https://mail.protonmail.com/create/new', 'https://singlelogin.org/registration.php'
        print(self.urlProtocoe[2])
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.protocoe = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')
        self.dirNombre = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/nombre.txt'
        self.nombre = open(self.dirNombre, 'r+')
        self.dirapellido = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/apellidos.txt'
        self.apellido = open(self.dirapellido, 'r+')
        self.dirContrasenna = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/contraseña.txt'
        self.contrasenna = open(self.dirContrasenna, 'r+')
        self.dirCotrasenna2 = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/contraseña2.txt'
        self.Contrasenna2 = open(self.dirCotrasenna2, 'r+')
        self.datosContrasenna = []
        self.lista = []
        # Read 101 raw name/password lines; only the first 100 are
        # normalized below, and 'asdsdf' pads every username.
        for self.d in range(0, 101):
            self.lista.append(self.nombre.readline() + 'asdsdf')
            self.datosContrasenna.append(self.contrasenna.readline() +
                                         self.Contrasenna2.readline())
        for self.d in range(0, 100):
            self.lista[self.d] = re.sub('\n', 'asdaawderca',
                                        self.lista[self.d])
            self.datosContrasenna[self.d] = re.sub(
                '\n', 'radabanals', self.datosContrasenna[self.d])
            # Strip combining accents (NFD) while keeping 'ñ', then recompose.
            self.lista[self.d] = re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                r"\1", normalize("NFD", self.lista[self.d]), 0, re.I)
            self.lista[self.d] = normalize('NFC', self.lista[self.d])
            self.datosContrasenna[self.d] = re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
                r"\1", normalize("NFD",
                                 self.datosContrasenna[self.d]), 0, re.I)
            self.datosContrasenna[self.d] = normalize(
                'NFC', self.datosContrasenna[self.d])

    def iniciarTor(self):
        self.protocoe.load_url(self.urlProtocoe[2])

    def ingresarDatos(self, fila):
        self.pasword = self.protocoe.find_element_by_name("password")
        self.pasword.click()
        sleep(random.uniform(1.0, 4))
        self.pasword.send_keys(self.datosContrasenna[fila])
        self.pasword = self.protocoe.find_element_by_name("passwordc")
        sleep(random.uniform(1.0, 3))
        self.pasword.click()
        self.pasword.send_keys(self.datosContrasenna[fila])
        sleep(random.uniform(2.0, 5.7))
        self.iframes = self.protocoe.find_element_by_tag_name("iframe")
        self.protocoe.switch_to.frame(self.iframes)
        self.usuario = self.protocoe.find_element_by_xpath('//input')
        self.usuario.click()
        self.usuario.send_keys(self.lista[fila])
        sleep(random.uniform(0, 5))
        self.usuario.send_keys(Keys.ENTER)
        self.protocoe.switch_to.default_content()
        sleep(20)
        self.enter = self.protocoe.find_element_by_xpath(
            '//button[@class="pm_button primary modal-footer-button"]')
        self.enter.click()

    def serrarTor(self):
        self.protocoe.close()

    def imprimirDatos(self):
        for d in range(0, 100):
            print(self.lista[d])
            print(self.datosContrasenna[d])
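The re.sub/normalize pair in __init__ strips combining accents while keeping 'ñ' intact. Extracted as a standalone helper (a sketch, not part of the original class), the technique is easier to see:

import re
from unicodedata import normalize


def quitar_acentos(texto):
    # Decompose characters (NFD), drop combining marks except the
    # tilde on 'n' (so 'ñ' survives), then recompose (NFC).
    sin_marcas = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1", normalize("NFD", texto), 0, re.I)
    return normalize("NFC", sin_marcas)


print(quitar_acentos("añadió"))  # -> añadio (accent dropped, ñ kept)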
Esempio n. 14
0
#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver

from sys import argv, exit

tbpath = "tor-browser_en-US"

if len(argv) == 1:
    file = "good-onions.txt"
else:
    file = argv[1]

with open(file, "r") as f:
    pages = f.read().split("\n")

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(90)

for page in pages:
    if len(page) == 0:
        continue
    try:
        driver.load_url("http://{}".format(page))
        driver.save_screenshot('shots/{}.png'.format(page))  # 'shots/' must already exist
    except Exception as e:
        print("Failed", page, e)

driver.quit()
exit(0)
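Two fragilities in this script are that shots/ must already exist and that a page entry containing a slash would break the screenshot path. A small hardening sketch (hypothetical helper, not from the original):

import os
import re

os.makedirs("shots", exist_ok=True)


def shot_name(page):
    # Replace anything unsafe for a filename with '_'.
    return "shots/{}.png".format(re.sub(r"[^A-Za-z0-9.-]", "_", page))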
Esempio n. 15
0
# Required imports for this fragment (assumed from context):
from time import sleep

import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver

tbb_dir = "/usr/local/share/tor-browser_en-US"  # bundle path used elsewhere in these examples


def loggin(ema, pas):
    try:
        browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    except Exception:
        # selenium.common.exceptions.WebDriverException: Message: Access is denied. (os error 5)
        # mozilla is updating
        print('probably updating, sleep 30')
        sleep(30)
        browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    # connect to site
    try:
        browser.load_url(
            "https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
            wait_on_page=5,
            wait_for_page_body=True)

    except Exception:
        # selenium.common.exceptions.NoSuchWindowException: Message: Browsing context has been discarded
        try:
            browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        except Exception:
            # selenium.common.exceptions.WebDriverException: Message: Access is denied. (os error 5)
            # mozilla is updating
            print('probably updating, sleep 30')
            sleep(30)
            browser = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)

        browser.load_url(
            "https://www.udemy.com/join/login-popup/?locale=en_US&response_type=html&next=https%3A%2F%2Fwww.udemy.com%2F",
            wait_on_page=5,
            wait_for_page_body=True)

    # reg_el.click()
    # maximise
    browser.maximize_window()
    # Scroll
    browser.execute_script("window.scrollTo(0,100)")
    try:
        email_el = browser.find_element_by_id("email--1")
    except Exception:
        sleep(10)  # give the login form a second chance to render
        try:
            email_el = browser.find_element_by_id("email--1")
        except Exception:
            return False
    email_el.send_keys(ema)
    # enter password
    pass_el = browser.find_element_by_id("id_password")
    pass_el.send_keys(pas)
    # find submit link
    sub_el = browser.find_element_by_id('submit-id-submit')
    # click submit
    sub_el.click()
    sleep(2)
    # check
    try:
        # Auto-generated id; brittle across Udemy releases.
        avatar = browser.find_element_by_id('u711-popover-trigger--18')
    except Exception:
        avatar = None

    if avatar:
        return browser
    elif 'udemy.com' in browser.current_url:
        return browser
    else:
        return None
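The driver-creation-with-retry logic appears twice in loggin(); it could be factored into one helper. A sketch under the same assumptions (tbb_dir, cm, and sleep as defined above):

def crear_browser(tbb_dir, intentos=2):
    # Tor Browser refuses to start while it is updating itself
    # (WebDriverException: "Access is denied"); wait and retry once.
    for intento in range(intentos):
        try:
            return TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
        except Exception:
            if intento + 1 == intentos:
                raise
            print('probably updating, sleeping 30')
            sleep(30)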
Esempio n. 16
0
# Required imports for this fragment (assumed from context):
import csv
import errno
import os
import re
from time import sleep

import pyautogui
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from tbselenium.tbdriver import TorBrowserDriver


class DescargarPdf:
    def __init__(self):
        self.contadorCredenciales = 0
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')
    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("in paginaDescargas()")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html = self.zLibraty.page_source
    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom=' +
                        str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        print("in cambiarPagina()")
        self.url += '&page=' + str(x)
        print(self.url)
    def Crearcsv(self):
        desde = datosDescarga(1)
        asta = datosDescarga(2)
        self.carpetaUrl = ('/home/dd/Documentos/zlibrary/libros' +
                           str(desde) + '-' + str(asta) + '/url')
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open(self.carpetaUrl + '/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(self.carpetaUrl + '/urlDowload2.csv', 'w'))
    def credenciales(self, numeroUsuario):
        print("reached credenciales()")
        if self.contadorCredenciales == 0 or self.contadorCredenciales == 20:
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            # Drive the native dropdown with keystrokes.
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
        sleep(5)
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
    def UsuariosYcontraseñas(self):
        self.dir = '/home/dd/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        # The credentials file alternates email and password lines.
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            else:
                self.contraseñaTxT.append(self.data.readline())
    def urlPdf(self):
        self.contadorCredenciales = 1
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/',
                                               'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub('/book/',
                                         'http://zlibraryexau2g3p.onion/book/',
                                         self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf,
                                                         'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("heading into the download branch")
                        sleep(15)
                        if self.voleano:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except Exception:
                                sleep(5)
                                self.zLibraty.set_page_load_timeout(7000)
                                print("PDF download started")
                            self.voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except Exception:
                                sleep(8)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                            self.zLibraty.set_page_load_timeout(7000)
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        self.zLibraty.load_url("about:downloads")
                        self.datosEsperaDescarga()
                        self.peticiones()
                        self.zLibraty.back()
                        informaiconPdf(self.urlpdfGeleneralH)
                        guardarNumeroDescargas(self.contadorLibros)
                self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        if self.contadorLibros == 20:
                            self.contadorCredenciales = 20
                            print("logging out!!!")
                            pyautogui.moveTo(1707, 245)
                            pyautogui.hotkey("ctrl", "shift", "u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            print("logging out")
                            self.zLibraty.get(
                                "http://zlibraryexau2g3p.onion/logout.php")
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to.window(
                                self.zLibraty.window_handles[0])
                        except Exception:
                            print("error switching windows")

                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("through here??")
                        self.credenciales(self.contadorUsuarios)
                        self.contadorCredenciales = 1
                        print("not through here??")
                        sleep(20)
                        self.iniciarSecion()
                        sleep(15)
                        self.paginaDescargas()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                        print("number of books per user:",
                              self.contadorLibros2)
                        if self.contador == 5:
                            self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            archivos = int(contarNueroArchivos())
            print(archivos)
            self.zLibraty.load_url("about:downloads")
            self.datosEsperaDescarga()
            self.peticiones()
            self.zLibraty.back()
            informaiconPdf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
    def datosEsperaDescarga(self):
        sleep(4)
        self.htmlValidador = self.zLibraty.page_source

    def validarDescarga(self):
        self.htmlFalce = self.zLibraty.page_source
        self.soupFalce = BeautifulSoup(self.htmlFalce, "html.parser")
        self.validarfalce = self.soupFalce.find_all(
            "description", class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce = re.search("value=.+", str(self.validarfalce))
        self.buscarFalse = self.respuestafalce.group()
        if re.search("Canceled", self.buscarFalse):
            print("the download broke =(")
            sleep(5)
            pyautogui.click(1393, 139)  # click the retry button
            sleep(5)
        elif re.search("Failed", self.buscarFalse):
            print("the download failed, but let's fix it =(")
            sleep(5)
            pyautogui.click(1393, 139)
            sleep(5)
        else:
            print("the download is going fine =)")
    def peticiones(self):
        self.validarDescarga()
        self.carga = 0
        self.daño = 0
        self.conteo = 0
        while self.carga < 100:
            self.soup = BeautifulSoup(self.htmlValidador, "html.parser")
            try:
                self.archivoDescarga = self.soup.find_all(
                    "progress", class_="downloadProgress")
                self.respaldo = re.split("value", str(self.archivoDescarga))
                self.tiempo = re.search("[0-9]+", self.respaldo[1])
                print(self.tiempo.group())
                self.carga = int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo == 3:
                    pyautogui.press("enter")
                    self.conteo = 0
            except Exception:
                print("oh no, the download broke and could not be restarted")
                if self.daño == 7:
                    os.system('rm -r /home/dd/zlibros/libros1920-1921/libro/*.*')
                    raise
                self.daño += 1
                sleep(5)
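For reference, the polling technique used by peticiones() and validarDescarga(), scraping the <progress> element on about:downloads until it reports 100, can be condensed into a single helper. A sketch only; the class names of the download-manager elements vary across Tor Browser versions:

from time import sleep

from bs4 import BeautifulSoup


def esperar_descarga(driver, poll=3, max_polls=200):
    # Poll about:downloads until the progress bar reaches 100 %.
    driver.load_url("about:downloads")
    for _ in range(max_polls):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        barra = soup.find("progress", class_="downloadProgress")
        if barra is not None and barra.get("value"):
            carga = int(barra["value"])
            print("download at", carga, "%")
            if carga >= 100:
                return True
        sleep(poll)
    return False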