Example #1
0
    def rapidVideoHandler(self, rvLink):
        """Scrape a direct ``.mp4`` URL out of the page at ``rvLink``.

        The current webkit server is killed and replaced by a fresh one, the
        page is opened in a new dryscrape session with image loading
        disabled, and the page body is searched for double-quoted
        http(s)/ftp(s) URLs. The page is reloaded up to 10 times, pausing
        2 seconds after each load. The freshly started server is always
        killed before returning, whether or not a link was found.

        Args:
            rvLink: URL of the page to scan.

        Returns:
            The first quoted URL containing ``.mp4``, or ``None`` when none
            of the attempts produced one.
        """
        self.server.kill()
        self.server = webkit_server.Server()
        try:
            server_conn = webkit_server.ServerConnection(server=self.server)
            driver = dryscrape.driver.webkit.Driver(connection=server_conn)

            browser2 = dryscrape.Session(base_url=rvLink, driver=driver)
            browser2.set_attribute('auto_load_images', False)
            # Initial load kept from the original flow; every attempt below
            # loads the page again.
            browser2.visit(rvLink)

            for attempt in range(1, 11):
                print("try n. " + str(attempt) + ":")
                browser2.visit(rvLink)
                sleep(2)

                # The pattern has two groups, so findall yields
                # (url, scheme) tuples; the URL is the first element.
                # Taking it directly keeps URLs that contain commas or
                # apostrophes intact.
                matches = re.findall('"((http|ftp)s?://.*?)"',
                                     browser2.body())
                for match in matches:
                    candidate = match[0]
                    if ".mp4" in candidate:
                        print(candidate)
                        return candidate

                print("failed")

            print("Error finding mp4 Link")
            return None
        finally:
            # Runs on success, on exhaustion and on errors, so the webkit
            # server process is never left behind.
            self.server.kill()
Example #2
0
    def __init__(self, object):
        """Open ``object`` in a WebKit-driven dryscrape session and parse it.

        A dedicated webkit server is started and kept on ``self.server``.
        The session stored on ``self.browser`` uses
        ``https://otakustream.tv`` as its base URL and has image loading
        turned off. The body of the visited page is parsed with
        ``html.fromstring`` and stored on ``self.root``.

        Args:
            object: URL or path handed to the session's ``visit``.
        """
        self.server = webkit_server.Server()
        webkit_driver = dryscrape.driver.webkit.Driver(
            connection=webkit_server.ServerConnection(server=self.server))

        session = self.browser = dryscrape.Session(
            driver=webkit_driver, base_url="https://otakustream.tv")
        session.set_attribute('auto_load_images', False)
        session.visit(object)

        page_source = session.body()
        self.root = html.fromstring(page_source)
Example #3
0
def get_html(urlQ, callback, xpath_hooks):
    """
    Take URLs from the URL queue (urlQ) until it is empty, load each one in
    a dryscrape WebKit session and call a callback that will handle the
    page source.

    For every page that loads, the callback is invoked as
    callback(session, url, response), where session is the object created
    by Session() in this function and response is the page body.

    xpath_hooks is a list used to determine when the page is loaded,
    see the docs for more details (e.g. ["//div[@data-test='whatever']"] ).
    The page counts as loaded as soon as any of the hooks matches. URLs that
    give an invalid response, or whose hooks do not match within 15 seconds,
    are logged and skipped.

    The webkit server is killed and the session closed when the function
    exits, including when the callback or the setup raises.
    """
    svr = webkit_server.Server()
    session = None
    try:
        svrconn = webkit_server.ServerConnection(server=svr)
        driver = dryscrape.driver.webkit.Driver(connection=svrconn)

        sess = dryscrape.Session(driver=driver)
        sess.set_header(
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        )
        sess.set_attribute("auto_load_images", False)

        def valid_page_func():
            # Loaded once at least one hook finds a node in the page.
            return any(sess.at_xpath(xpath) for xpath in xpath_hooks)

        session = Session()

        # NOTE(review): empty() followed by get() can block if another
        # consumer drains the queue in between -- confirm whether urlQ is
        # shared between workers.
        while not urlQ.empty():
            url = urlQ.get()

            try:
                sess.visit(url)
            except webkit_server.InvalidResponseError:
                LOGGER.error(
                    "Got invalid response from something? Skipping {}".format(url))
                continue

            try:
                sess.wait_for(valid_page_func, interval=1, timeout=15)
            except dryscrape.mixins.WaitTimeoutError:
                LOGGER.error("Timeout so skipping {}".format(url))
                continue

            response = sess.body()
            callback(session, url, response)
            # Only reached for pages that were handed to the callback;
            # skipped URLs do not reset the browser session.
            sess.reset()
    finally:
        # Same teardown order as before (server first, then session), but
        # now guaranteed even when something above raises.
        try:
            svr.kill()
        finally:
            if session is not None:
                session.close()
Example #4
0
    def __init__(self, object):
        """Start a WebKit-driven session and open the episode page.

        A webkit server is started and kept on ``self.server``. The session
        on ``self.browser`` uses ``self.baseUrl`` (``https://otakustream.tv``)
        as its base URL, with image loading, JavaScript-opened windows and
        plugins disabled. Everything in ``object`` after the
        ``otakustream.tv`` path segment is turned into a site-relative path
        and visited.

        Args:
            object: URL containing ``otakustream.tv`` as one of its
                ``/``-separated segments.

        Raises:
            ValueError: if no segment of ``object`` equals
                ``otakustream.tv``.
        """
        self.server = webkit_server.Server()
        webkit_driver = dryscrape.driver.webkit.Driver(
            connection=webkit_server.ServerConnection(server=self.server))

        self.baseUrl = "https://otakustream.tv"
        session = self.browser = dryscrape.Session(
            driver=webkit_driver, base_url=self.baseUrl)
        for attribute in ('auto_load_images',
                          'javascript_can_open_windows',
                          'plugins_enabled'):
            session.set_attribute(attribute, False)

        # Keep only what follows the host segment and make it site-relative.
        segments = object.split("/")
        host_position = segments.index("otakustream.tv")
        epPage = "/" + '/'.join(segments[host_position + 1:])

        session.visit(epPage)
Example #5
0
    def oLoadHandler(self, rvLink):
        """Resolve the video URL behind the player page at ``rvLink``.

        The current webkit server is killed and replaced by a fresh one. The
        page is opened in a new dryscrape session with image loading,
        JavaScript-opened windows and plugins disabled. The element with id
        ``videooverlay`` is clicked, then the ``src`` attribute of the
        element with id ``olvideo_html5_api`` is read and prefixed with
        ``https://oload.site``. The freshly started server is always killed
        before returning or propagating an error.

        Args:
            rvLink: URL of the player page.

        Returns:
            ``"https://oload.site"`` followed by the video element's ``src``.
        """
        self.server.kill()
        self.server = webkit_server.Server()
        try:
            server_conn = webkit_server.ServerConnection(server=self.server)
            driver = dryscrape.driver.webkit.Driver(connection=server_conn)

            browser2 = dryscrape.Session(base_url=rvLink, driver=driver)
            browser2.set_attribute('auto_load_images', False)
            browser2.set_attribute('javascript_can_open_windows', False)
            browser2.set_attribute('plugins_enabled', False)
            browser2.visit(rvLink)

            # NOTE(review): at_xpath presumably returns None when nothing
            # matches, in which case the following line raises -- confirm
            # against the dryscrape docs.
            overlay = browser2.at_xpath('//*[@id="videooverlay"]')
            overlay.click()
            print("clicked overlay")

            video = browser2.at_xpath('//*[@id="olvideo_html5_api"]')
            finalLink = "https://oload.site" + video['src']
            print(finalLink)
            return finalLink
        finally:
            # Previously the server was only killed on success; a failed
            # visit or a missing element left the process running.
            self.server.kill()
Example #6
0
from bs4 import BeautifulSoup
import dryscrape
import webkit_server

# Main page to scrape
url = 'http://iws.mx/dnd/?list'
# Base URL for the individual pages
baseUrl = 'http://iws.mx/dnd/'

# List of categories to skip
categorieDaSaltare = ['Everything', 'Glossary']
# Starts empty; presumably filled with the categories extracted while
# scraping -- verify against the code that uses it.
categorieEstratte = []

# Server configuration meant to avoid memory leaks: xvfb is started first,
# then a session is built on an explicitly created webkit server.
dryscrape.start_xvfb()
server = webkit_server.Server()
server_conn = webkit_server.ServerConnection(server=server)
driver = dryscrape.driver.webkit.Driver(connection=server_conn)
session = dryscrape.Session(driver=driver)


def main():
    # Avvio la sessione
    session = dryscrape.Session()
    # Apro la pagina
    session.visit(url)
    # Attendo il caricamento
    print("[", time.asctime(time.localtime(time.time())),
          "] Attendo che venga caricata la pagina ", url)
    time.sleep(15)  # Serve a non far crashare (vedi FIXME seguente)