Beispiel #1
0
 def __init__(self, _server, _output_dir, _search, _first_page, _last_page):
     self.server = _server
     self.output_dir = _output_dir
     self.search = _search
     self.first_page = _first_page
     self.last_page = _last_page
     self.i = 1
     self.j = 1
     self.Conection = Conection()
     self.jsonfile = JsonFile("data")
     self.list_link = []
     self.ydl_opts = {}
Beispiel #2
0
class OcrReader:
    def __init__(self, _root):
        self.root = _root
        self.qt_arq = 0
        self.qt_lida = 0
        self.porcento = 0
        self.current_data = JsonFile("current_data")

    def generate_ocr(
        self, current_dir
    ):  # d = diretorio imagem #n = nome imagem #c = caminho do arquivo de imagem
        texto = ocr.image_to_string(Image.open(current_dir), lang='por')
        arq = open(current_dir + ".txt", "w")
        arq.write(texto)
        arq.close()

    def get_quantity(self, img_extension):
        self.qt_arq = 0
        for path, subdirs, files in os.walk(self.root):
            for name in files:
                if fnmatch(name, '*.' + img_extension
                           ):  # se nome do arquivo do subdir tiver a extensao
                    current_dir = os.path.join(path, name)
                    if os.path.isfile(
                            current_dir + ".txt"
                    ) == 0:  # se nao existir o .txt com texto, cria
                        self.qt_arq += 1
                        print(img_extension + ' file foud:' + current_dir)
        return self.qt_arq

    def generate_all(self, img_extension):
        self.qt_lida = 0
        if self.qt_arq == 0:
            self.qt_arq = self.get_quantity(img_extension)
        for path, subdirs, files in os.walk(self.root):
            for name in files:
                if fnmatch(name, '*.' + img_extension
                           ):  # se nome do arquivo do subdir tiver a extensao
                    current_dir = os.path.join(path, name)
                    if os.path.isfile(
                            current_dir + ".txt"
                    ) == 0:  # se nao existir o .txt com texto, cria
                        self.qt_lida += 1
                        print("Converting: " + "[" + str(self.qt_lida) +
                              " from " + str(self.qt_arq) + "]" + current_dir)
                        self.generate_ocr(current_dir)
                        self.current_data.json_img_txt_details_conversion(
                            self.qt_arq, self.qt_lida, current_dir)
Beispiel #3
0
 def __init__(self, _root):
     self.root = _root
     self.qt_arq = 0
     self.qt_lida = 0
     self.porcento = 0
     self.current_data = JsonFile("current_data")
Beispiel #4
0
class Downloader:
    def __init__(self, _server, _output_dir, _search, _first_page, _last_page):
        self.server = _server
        self.output_dir = _output_dir
        self.search = _search
        self.first_page = _first_page
        self.last_page = _last_page
        self.i = 1
        self.j = 1
        self.Conection = Conection()
        self.jsonfile = JsonFile("data")
        self.list_link = []
        self.ydl_opts = {}

    def download(self):
        self.list_link = self.get_list_link()
        if self.list_link != 0:
            for j in range(0, len(self.list_link)):
                self.jsonfile.json_details_write(len(self.list_link), j,
                                                 self.list_link[j])
                self.ydl_opts = {
                    'outtmpl': self.output_dir + '\%(title)s.%(ext)s'
                }
                with youtube_dl.YoutubeDL(self.ydl_opts) as ydl:
                    ydl.download([self.list_link[j]])

    def get_list_link(self):
        self.list_link = []
        if self.Conection.get_status():
            # Test witch server will scan
            if self.server == 'xvideos':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.xvideos.com/?k=' + self.search +
                            '&p=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='thumb'
                    ):  # thumb é a classe de div que contem os links
                        link = "https://www.xvideos.com" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'eporner':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.eporner.com/search/' + self.search +
                            '/' + str(self.i)) + '/')
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='mb'
                    ):  # mb é a classe de div que contem os links
                        link = "https://www.eporner.com" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'gotporn':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.gotporn.com/results?search_query=' +
                            self.search + '&page=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for li in soup.find_all(
                            class_='video-item'
                    ):  # thumb é a classe de div que contem os links
                        link = str(li.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'p**n':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.p**n.com/videos/search?q=' +
                            self.search + '&p=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='thumb'
                    ):  # thumb é a classe de div que contem os links
                        link = "https://www.p**n.com/videos" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'pornhd':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.pornhd.com/popular/' + self.search +
                            '?page=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='thumb videoThumb popTrigger'
                    ):  # thumb videoThumb popTrigger é a classe de div que contem os links
                        link = "https://www.pornhd.com" + str(div['href'])
                        self.list_link.append(link)
            elif self.server == 'pornhub':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://pt.pornhub.com/video/search?search=' +
                            self.search + '&page=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='img fade fadeUp videoPreviewBg'
                    ):  # img fade videoPreviewBg fadeUp é a classe de div que contem os links
                        link = "https://pt.pornhub.com" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'redtube':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.redtube.com/?search=' + self.search +
                            '&page=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for span in soup.find_all(
                            class_='video_thumb_wrap'
                    ):  # video-box four-column video_block_wrapper é a classe de div que contem os links
                        link = "https://www.redtube.com" + str(
                            span.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'spankbang':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://br.spankbang.com/s/' + self.search + '/' +
                            str(self.i)) + "/")
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='video-item'
                    ):  # video-item é a classe de div que contém os links
                        link = "https://br.spankbang.com" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'xhamster':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://pt.xhamster.com/search?q=' + self.search +
                            '&p=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='video-thumb-info'
                    ):  # video-thumb-info é a classe de div que contem os links
                        link = str(div.find('a')['href'])
                        self.list_link.append(link)
            elif self.server == 'youporn':
                for self.i in range(self.first_page, self.last_page):
                    response = requests.get(
                        str('https://www.youporn.com/search/?query=' +
                            self.search + '&page=' + str(self.i)))
                    soup = bs4.BeautifulSoup(response.text, "html.parser")
                    for div in soup.find_all(
                            class_='video-box four-column video_block_wrapper'
                    ):  # video-box four-column video_block_wrapper é a classe de div que contem os links
                        link = "https://www.youporn.com" + str(
                            div.find('a')['href'])
                        self.list_link.append(link)
                return self.list_link
            elif self.server == 'beeg':
                url = str('https://www.beeg.com/tag/' + self.search)
                options = webdriver.ChromeOptions()
                options.add_argument('--ignore-certificate-errors')
                options.add_argument("--test-type")
                options.binary_location = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
                driver = webdriver.Chrome(chrome_options=options)
                try:
                    driver.get(url)
                    print("Aguardando carregamento em 20s ...")
                    time.sleep(20)
                    html = driver.page_source
                    soup = bs4.BeautifulSoup(html, "html.parser")
                    for div in soup.find_all(
                            class_='thumb-unit'
                    ):  # thumb-unit é a classe de div que contem os links
                        link = "https://beeg.com" + str(div.find('a')['href'])
                        self.list_link.append(link)
                    return self.list_link
                except TimeoutException:
                    print("Tempo de carregamento esgotado!")
                except:
                    os.system("cls")
                    print(
                        "Tempo limite para o carregamento da página esgotado.")
                    print("Tente novamente!")
            else:
                print("Server not found.")
                self.list_link = []
            return self.list_link
        else:
            print("No conection... ")
            return 0