def rapidVideoHandler(self, rvLink):
    """Resolve a RapidVideo page to a direct ``.mp4`` URL.

    Replaces the instance's webkit server with a fresh one (the old one is
    killed first), then repeatedly loads ``rvLink`` — up to 10 attempts —
    scanning the rendered body for a quoted http/https/ftp/ftps URL that
    contains ``.mp4``.

    :param rvLink: URL of the RapidVideo embed page.
    :return: the first matching ``.mp4`` URL, or ``None`` after 10 failed tries.
    """
    # Restart the webkit backend to get a clean session for this page.
    self.server.kill()
    self.server = webkit_server.Server()
    server_conn = webkit_server.ServerConnection(server=self.server)
    driver = dryscrape.driver.webkit.Driver(connection=server_conn)
    browser2 = dryscrape.Session(base_url=rvLink, driver=driver)
    browser2.set_attribute('auto_load_images', False)
    try:
        # NOTE: the original performed one extra visit() before the loop;
        # the loop's own visit() makes it redundant, so it was dropped.
        for attempt in range(10):
            print("try n. " + str(attempt + 1) + ":")
            browser2.visit(rvLink)
            sleep(2)
            # re.findall with two groups yields (full_url, scheme_suffix)
            # tuples. BUG FIX: the original round-tripped each tuple through
            # str() and split on "," then "'", which corrupted URLs that
            # contain commas or quotes; use the first group directly.
            for url, _scheme in re.findall('"((http|ftp)s?://.*?)"', browser2.body()):
                if ".mp4" in url:
                    print(url)
                    return url
            print("failed")
        print("Error finding mp4 Link")
        return None
    finally:
        # BUG FIX: the original killed the server only on the success path,
        # leaking the freshly spawned webkit process when no link was found.
        self.server.kill()
def __init__(self, object):
    """Spin up a dedicated webkit server, open *object* and parse its HTML.

    NOTE(review): the parameter shadows the builtin ``object``; it appears to
    be the URL (or path, relative to the base URL) of the page to load.
    Kept unrenamed to preserve the caller-visible signature.
    """
    # One private webkit server per instance so it can be torn down later.
    self.server = webkit_server.Server()
    connection = webkit_server.ServerConnection(server=self.server)
    webkit_driver = dryscrape.driver.webkit.Driver(connection=connection)

    # Session rooted at the site's base URL; images are skipped for speed.
    session = dryscrape.Session(base_url="https://otakustream.tv", driver=webkit_driver)
    session.set_attribute('auto_load_images', False)
    self.browser = session

    # Render the requested page and keep the parsed DOM root around.
    self.browser.visit(object)
    self.root = html.fromstring(self.browser.body())
def get_html(urlQ, callback, xpath_hooks):
    """Drain the URL queue, fetch each page with webkit, and hand the source to *callback*.

    This function takes a url from the URL Queue (urlQ) and calls a callback
    that will handle the page source. ``xpath_hooks`` is a list used to
    determine when the page is loaded, see the docs for more details
    (e.g. ``["//div[@data-test='whatever']"]``).

    :param urlQ: queue of URLs to fetch (consumed until empty).
    :param callback: ``callback(session, url, response)`` invoked per page.
    :param xpath_hooks: XPaths; the page counts as loaded when any matches.
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)
    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    # PEP 8: a named def instead of a lambda bound to a name.
    def valid_page_func():
        return any(sess.at_xpath(xpath) for xpath in xpath_hooks)

    session = Session()
    # BUG FIX: the original killed the server / closed the session only when
    # the loop finished normally; an unexpected exception (e.g. from the
    # callback) leaked both. try/finally guarantees cleanup.
    try:
        while not urlQ.empty():
            url = urlQ.get()
            try:
                sess.visit(url)
            except webkit_server.InvalidResponseError:
                LOGGER.error(
                    "Got invalid response from something? Skipping {}".format(url))
                continue
            try:
                sess.wait_for(valid_page_func, interval=1, timeout=15)
            except dryscrape.mixins.WaitTimeoutError:
                LOGGER.error("Timeout so skipping {}".format(url))
                continue
            response = sess.body()
            callback(session, url, response)
            sess.reset()
    finally:
        svr.kill()
        session.close()
def __init__(self, object):
    """Open the episode page given by *object* in a fresh, locked-down webkit session.

    NOTE(review): the parameter shadows the builtin ``object``; it appears to
    be a full otakustream.tv episode URL. Kept unrenamed to preserve the
    caller-visible signature. Raises ``ValueError`` (from ``list.index``)
    if "otakustream.tv" is not a path component of the URL.
    """
    # Dedicated webkit backend for this instance.
    self.server = webkit_server.Server()
    connection = webkit_server.ServerConnection(server=self.server)
    webkit_driver = dryscrape.driver.webkit.Driver(connection=connection)

    self.baseUrl = "https://otakustream.tv"
    self.browser = dryscrape.Session(base_url=self.baseUrl, driver=webkit_driver)
    # Keep the session lean and contained: no images, popups, or plugins.
    self.browser.set_attribute('auto_load_images', False)
    self.browser.set_attribute('javascript_can_open_windows', False)
    self.browser.set_attribute('plugins_enabled', False)

    # Reduce the absolute URL to a site-relative path: drop everything up to
    # and including the "otakustream.tv" segment, keep the rest.
    segments = object.split("/")
    anchor = segments.index("otakustream.tv")
    ep_page = "/" + "/".join(segments[anchor + 1:])
    self.browser.visit(ep_page)
def oLoadHandler(self, rvLink):
    """Resolve an openload (oload.site) embed page to a direct video URL.

    Replaces the instance's webkit server with a fresh, locked-down session,
    clicks the video overlay to trigger the player, and reads the ``src``
    of the HTML5 video element.

    :param rvLink: URL of the oload embed page.
    :return: absolute video URL ("https://oload.site" + the element's src).
    :raises AttributeError: if the overlay or video element is not found
        (``at_xpath`` returned ``None``) — unchanged from the original.
    """
    # Restart the webkit backend for a clean session.
    self.server.kill()
    self.server = webkit_server.Server()
    server_conn = webkit_server.ServerConnection(server=self.server)
    driver = dryscrape.driver.webkit.Driver(connection=server_conn)
    browser2 = dryscrape.Session(base_url=rvLink, driver=driver)
    browser2.set_attribute('auto_load_images', False)
    browser2.set_attribute('javascript_can_open_windows', False)
    browser2.set_attribute('plugins_enabled', False)
    # BUG FIX: the original killed the server only on the success path; if
    # at_xpath returned None (AttributeError on .click()/['src']) or visit
    # raised, the freshly spawned webkit process was leaked.
    try:
        browser2.visit(rvLink)
        overlay = browser2.at_xpath('//*[@id="videooverlay"]')
        overlay.click()
        print("clicked overlay")
        a = browser2.at_xpath('//*[@id="olvideo_html5_api"]')
        finalLink = "https://oload.site" + a['src']
        print(finalLink)
        return finalLink
    finally:
        self.server.kill()
import dryscrape
import webkit_server

# Main page to scrape.
url = 'http://iws.mx/dnd/?list'
# Base URL for the various pages.
baseUrl = 'http://iws.mx/dnd/'
# List of categories to ignore.
categorieDaSaltare = ['Everything', 'Glossary']
categorieEstratte = []

# Server configured at module level so as to avoid a memory leak.
dryscrape.start_xvfb()
server = webkit_server.Server()
server_conn = webkit_server.ServerConnection(server=server)
driver = dryscrape.driver.webkit.Driver(connection=server_conn)
session = dryscrape.Session(driver=driver)


def main():
    # Start the session.
    # NOTE(review): this local assignment shadows and discards the
    # module-level ``session`` built with the custom driver above — the
    # leak-avoidance setup is not used here; confirm this is intended.
    session = dryscrape.Session()
    # Open the page.
    session.visit(url)
    # Wait for the page to load.
    # NOTE(review): ``time`` is not imported in this chunk — presumably
    # imported elsewhere in the file; verify.
    print("[", time.asctime(time.localtime(time.time())), "] Attendo che venga caricata la pagina ", url)
    # Keeps it from crashing (see the following FIXME).
    time.sleep(15)
    # Retrieve the request body, i.e. the page source.