def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
    """Store DB credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.source = 'Discover Dark Web Hidden Service'
    logging.basicConfig(level=logging.INFO)
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    self.logger = logging.getLogger('Class:DiscoverDarkWebService')
    self.session = requests.session()
    # Tor SOCKS proxy; note only plain-HTTP traffic is proxied here.
    self.proxies = {
        'http': 'socks5h://localhost:9050',
    }
def __init__(self, file=None, host_db=None, user_db=None, password_db=None,
             database=None):
    """Store DB credentials, open the database handle and remember the input file.

    Args:
        file: path of the external URL-list file to import.
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.source = 'Pastebin'
    logging.basicConfig(level=logging.INFO)
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    self.logger = logging.getLogger('Class:ExternalListAPI')
    self.file = file
def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
    """Store DB credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.source = 'Alt OnionDir'
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    self.logger = logging.getLogger('Class:AltOnionDir')
    self.session = requests.session()
    # Tor SOCKS proxy; note only plain-HTTP traffic is proxied here.
    self.proxies = {
        'http': 'socks5h://localhost:9050',
    }
def start_StartList(self):
    """Run the crawler over a user-supplied file containing a list of URLs.

    Exits with status 1 when the file given in ``self.args.list`` does not
    exist.
    """
    # Guard clause: fail fast on a missing list file.
    if not os.path.isfile(self.args.list):
        self.logger.error(
            ' O arquivo {file} não existe ou o caminho está incorreto, verifique e tente novamente.\n\n'
            .format(file=self.args.list))
        exit(1)
    # Kept for its connection side effects, as in the original code
    # (the handle itself was never used afterwards).
    database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
    )
    self.logger.info(
        ' Iniciando o processo de Crawler por arquivo de lista.')
    if self.args.ignoredate:
        self.logger.info(
            ' VOCÊ ESTÁ IGNORANDO A ULTIMA VEZ VISTA DA URL.')
    # The original duplicated the whole TorConnect construction in both
    # branches, differing only in `ignoredate`; deduplicated here
    # (True when the flag is set, the default None otherwise).
    tor = TorConnect(
        telegram_chat_id=self.telegram_chat_id,
        telegram_token=self.telegram_token,
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
        list=True,
        list_file=self.args.list,
        ignoredate=True if self.args.ignoredate else None,
    )
    # NOTE(review): `start` is presumably a property on TorConnect (as on the
    # scraper classes in this file), so bare attribute access runs it — confirm.
    tor.start
def __init__(self, alone=None, host_db=None, user_db=None, password_db=None,
             database=None, telegram_chat_id=None, telegram_token=None,
             order=None, number=None, list=None, list_file=None,
             ignoredate=None):
    """Hold the crawl configuration and open the supporting resources."""
    # Crawl-mode options.
    self.ignoredate = ignoredate
    self.number = number
    self.order = order
    self.list = list
    self.list_file = list_file
    # Database credentials and handle.
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    # Telegram notification settings.
    self.telegram_chat_id = telegram_chat_id
    self.telegram_token = telegram_token
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.logger = logging.getLogger('Class:TorConnect')
    self.telegram = Telegram()
    self.date = datetime.now()
    # HTTP session routed through the local Tor SOCKS proxy.
    self.session = requests.session()
    self.desktop_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
    ]
    self.proxies = {
        'http': 'socks5h://localhost:9050',
    }
    self.alone = alone
def start_StartCrawler(self):
    """Wipe the working tables and launch a full crawl run."""
    db = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
    )
    db.clear()
    self.logger.info(' Iniciando o processo de Crawler')
    crawler = TorConnect(
        telegram_chat_id=self.telegram_chat_id,
        telegram_token=self.telegram_token,
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
    )
    # `start` looks like a property on TorConnect; attribute access runs it.
    crawler.start
def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
    """Store DB credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.source = 'TORCH'
    logging.basicConfig(level=logging.INFO)
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    self.logger = logging.getLogger('Class:TORCH')
    self.session = requests.session()
    # Tor SOCKS proxy; note only plain-HTTP traffic is proxied here.
    self.proxies = {
        'http': 'socks5h://localhost:9050',
    }
    # TORCH search engine hidden-service address.
    self.url = 'http://xmh57jrzrnw6insl.onion'
    self.desktop_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
    ]
def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
    """Store DB credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.logger = logging.getLogger('Class:CyberSecurityNews')
    # TODO: QuickStart
    logging.basicConfig(level=logging.INFO)
    self.source = 'CyberSecurityNews-Pastebin'
    self.session = requests.session()
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    # TODO: move these to a configuration file.
    self.argument = '.onion'
    self.url = 'https://pastebin.com/u/cybersecuritynews/1'
    self.desktop_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
def __init__(
        self,
        host_db=None,
        user_db=None,
        password_db=None,
        database=None,
        api_key=None,
        cse_id=None,
):
    """Store credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
        api_key: Google Custom Search API key.
        cse_id: Google Custom Search Engine id.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.api_key = api_key
    self.cse_id = cse_id
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.logger = logging.getLogger('Class:GoogleAPI')
    self.source = 'Google'
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    self.session = requests.session()
    self.desktop_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
    ]
def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
    """Store DB credentials, open the database handle and register this source.

    Args:
        host_db: database host.
        user_db: database user.
        password_db: database password.
        database: database (schema) name.
    """
    self.host_db = host_db
    self.user_db = user_db
    self.password_db = password_db
    self.database_name = database
    self.database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database_name,
    )
    self.logger = logging.getLogger('Class:GistAPI')
    # TODO: QuickStart
    logging.basicConfig(level=logging.INFO)
    self.source = 'gist'
    # Register the source only once (fixes the 'compare_sorce' typo and the
    # empty `if: pass / else:` anti-pattern of the original).
    if not self.database.compare_source(source=self.source):
        self.database.save_source(source=self.source)
    # TODO: move these to a configuration file.
    self.argument = '.onion'
    self.url = 'https://gist.github.com/search?l=Text&q='
    self.desktop_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
    ]
def start_StartCrawler_order(self, number=None):
    """Launch an ordered crawl limited to `number` entries; no-op if None."""
    if number is None:
        return
    # Kept for its connection side effects, mirroring the original code.
    database = DataBase(
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
    )
    self.logger.info(' Iniciando o processo de Crawler')
    crawler = TorConnect(
        telegram_chat_id=self.telegram_chat_id,
        telegram_token=self.telegram_token,
        host_db=self.host_db,
        user_db=self.user_db,
        password_db=self.password_db,
        database=self.database,
        order=True,
        number=number,
    )
    # `start` looks like a property on TorConnect; attribute access runs it.
    crawler.start
def start(self):
    """Dispatch execution according to the parsed command-line arguments.

    Configures logging, then runs whichever sub-commands were requested
    (clear DB, initial load, list crawl, framework scrape, import, crawl,
    single-URL crawl, ordered crawl).
    """
    self.header = header()
    self.header.start
    start_framework = None
    if self.args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )
    # BUG FIX: the logger name was misspelled ('googleapicliet...'), so the
    # noisy discovery-cache logger was never actually silenced.
    logging.getLogger('googleapiclient.discovery_cache').setLevel(
        logging.ERROR)
    if self.args.clear:
        database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database,
        )
        database.clear()
    if self.args.initial:
        self.initialdb()
    if self.args.StartList:
        self.logger = logging.getLogger('StartListURL')
        self.start_StartList()
    if self.args.StartGet:
        self.logger = logging.getLogger('GetStStartGetart')
        if self.args.framework is not None:
            self.start_StartGet(argument=self.args.framework)
        else:
            self.logger.error(
                ' Você precisa informar o framework que deseja usar\nEx: -Sc --framework github,gist\n\n'
            )
            exit(0)
    if self.args.StartImport:
        self.logger = logging.getLogger('StartImport')
        if self.args.imput is not None:
            self.start_StartImport(argument=self.args.imput)
        else:
            self.logger.error(
                ' Você precisa informar o arquivo que deseja importar\nEx: -Si --imput /root/file.txt\n\n'
            )
            exit(0)
    if self.args.StartCrawler:
        self.logger = logging.getLogger('StartCrawler')
        self.start_StartCrawler()
    if self.args.StartCrawlerURL:
        self.logger = logging.getLogger('StartCrawlerURL')
        self.start_StartCrawler_alone()
    if self.args.StartOrder:
        self.logger = logging.getLogger('StartCrawlerURLDESC')
        try:
            # The original `isinstance(int(x), int)` test was tautological;
            # int() raising ValueError is the real validation.
            self.start_StartCrawler_order(number=int(self.args.order))
        except ValueError:
            self.logger.error(
                ' Você precisa informar um número\nEx: -So --order 3\n\n')
            exit(1)
class GistAPI:
    """Scrapes GitHub Gist search results for .onion addresses and stores them."""

    def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
        """Store DB credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.logger = logging.getLogger('Class:GistAPI')
        # TODO: QuickStart
        logging.basicConfig(level=logging.INFO)
        self.source = 'gist'
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        # TODO: move these to a configuration file.
        self.argument = '.onion'
        self.url = 'https://gist.github.com/search?l=Text&q='
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
        ]

    @property
    def random_headers(self):
        """Return request headers with a user agent picked at random."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        """Run the full pipeline: connect, paginate, scrape and persist."""
        self.cookies()
        self.pagination()
        self.scraping()
        self.raw()

    def cookies(self):
        """Open the HTTP session and verify the Gist search page answers."""
        self.logger.info(' Iniciando Scrap no Gist.')
        with requests.Session() as self.session:
            self.headers = self.random_headers
            request = self.session.get(self.url + self.argument,
                                       headers=self.headers)
            if request.status_code != 200:
                # BUG FIX: the original evaluated `GistAPI.start`, which only
                # returns the property descriptor and silently does nothing.
                # Log the failure instead of the dead retry.
                self.logger.error(
                    ' Não consegui conectar na url, status code: {code}'.format(
                        code=request.status_code))

    def pagination(self):
        """Collect the list of result-page URLs into ``self.urls``."""
        # URL-encode the keyword.
        self.query = urllib.parse.quote(self.argument)
        full_url = self.url + self.argument
        self.logger.info(' Conectando em {}'.format(full_url))
        time.sleep(5)
        request = self.session.get(full_url, headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")
        pages = []
        self.urls = [full_url]
        # Check whether the search result spans more than one page.
        try:
            for pagination in self.soup.find('div', {
                    'class': 'pagination'
            }).findAll('a'):
                pages.append(pagination.get_text())
        except AttributeError:
            # find() returned None: no pagination widget on the page.
            pages = False
        # With more than one page, build the URL for every result page.
        if pages:
            cont = 2
            while cont <= int(pages[-2]):
                cont += 1
                full_url = 'https://gist.github.com/search?l=Text&p={pagination}&q={query}'.format(
                    query=self.query, pagination=cont - 1)
                self.urls.append(full_url)

    def scraping(self):
        """Visit every result page and collect raw gist URLs into ``self.urls_raw``."""
        gist_links = []
        for inurl in self.urls:
            self.logger.info(' Conectando em {}'.format(inurl))
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)
            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if self.argument in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                gist_links.append(raw['href'])
                            except KeyError:
                                # anchor without href; skip it.
                                pass
        self.urls_raw = []
        for get in gist_links:
            self.logger.info(' Conectando em {}'.format(get))
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)
                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = "{url}{gist}".format(
                                url="https://gist.githubusercontent.com",
                                gist=raw['href'])
                            self.urls_raw.append(gist_url)
                        except KeyError:
                            pass
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                    .format(e=e))

    def raw(self):
        """Download each raw .txt gist, extract .onion addresses and save them."""
        self.logger.info(' Realizando os replaces e regex. AGUARDE...')
        itens = []
        for raw_url in self.urls_raw:
            if '.txt' in raw_url.lower():
                time.sleep(5)
                request = self.session.get(raw_url, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    # renamed from `list`, which shadowed the builtin.
                    body_lines = pre.get_text().split('\n')
                    itens.extend(body_lines)
        # Raw string avoids the invalid '\.' escape of the original.
        regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '')
            url = regex.match(rurls)
            if url is not None:
                self.database.saveonion(url=url.group(), source=self.source)
class CyberSecurityNews:
    """Scrapes the cybersecuritynews Pastebin account for .onion addresses."""

    def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
        """Store DB credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.logger = logging.getLogger('Class:CyberSecurityNews')
        # TODO: QuickStart
        logging.basicConfig(level=logging.INFO)
        self.source = 'CyberSecurityNews-Pastebin'
        self.session = requests.session()
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        # TODO: move these to a configuration file.
        self.argument = '.onion'
        self.url = 'https://pastebin.com/u/cybersecuritynews/1'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        """Return request headers with a user agent picked at random."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        """Run the pipeline: normalize stored URLs, then scrape all pages."""
        self.database.replaces()
        self.pages()

    def pages(self):
        """Walk every profile page, fetch each raw paste and persist .onion hits."""
        self.headers = self.random_headers
        self.logger.info(' Conectando em {}'.format(self.url))
        time.sleep(2)
        request = self.session.get(self.url, headers=self.headers)
        if request.status_code == 200:
            soup = BeautifulSoup(request.content, features="lxml")
            # Discover how many profile pages exist via the pagination widget.
            pages_to_pages = []
            for raw in soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages_to_pages.append(raw.get_text())
            cont = 2
            pages_urls = [self.url]
            while cont <= int(pages_to_pages[-2]):
                cont += 1
                pages_urls.append(
                    "https://pastebin.com/u/cybersecuritynews/{}".format(cont - 1))
            # Collect the raw-paste URLs from every page (skipping archives).
            raw_urls = []
            for get_urls in pages_urls:
                self.logger.info(' Conectando em {}'.format(get_urls))
                request = self.session.get(get_urls, headers=self.headers)
                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    for raw in soup.find('table', {'class': 'maintable'}).findAll('a'):
                        if 'archive' not in raw['href']:
                            raw_urls.append(
                                "https://pastebin.com/raw{}".format(raw['href']))
            itens = []
            self.logger.info(' Realizando os replaces e regex. AGUARDE...')
            for raw in raw_urls:
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    # renamed from `list`, which shadowed the builtin.
                    body_lines = pre.get_text().split('\n')
                    itens.extend(body_lines)
            # Raw string avoids the invalid '\.' escape of the original.
            regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
            for lines in itens:
                rurls = lines \
                    .replace('\xad', '') \
                    .replace('\n', '') \
                    .replace("http://", '') \
                    .replace("https://", '') \
                    .replace(r'\s', '') \
                    .replace('\t', '')
                url = regex.match(rurls)
                if url is not None:
                    self.database.saveonion(
                        url=url.group(),
                        source=self.source)
class AltOnionDir:
    """Scrapes the Alt OnionDir hidden-service directory for .onion addresses."""

    def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
        """Store DB credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'Alt OnionDir'
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:AltOnionDir')
        self.session = requests.session()
        # Tor SOCKS proxy; only plain-HTTP traffic is proxied.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }

    @property
    def start(self):
        """Run the pipeline: normalize stored URLs, then scrape the directory."""
        self.database.replaces()
        self.alt_onionDir()

    def alt_onionDir(self):
        """Walk the directory's category pages and persist every .onion found."""
        url = 'http://onionf3ck2i74bmm.onion'
        self.logger.info(' Conectando em {url}'.format(url=url))
        request = self.session.get(url, proxies=self.proxies, timeout=1000)
        soup = BeautifulSoup(request.content, features="lxml")
        # Category pages listed in the navbar.
        pages = []
        for raw in soup.find('navbar', {'id': 'content-navbar'}).findAll('a'):
            if '.html' in raw['href'].lower():
                pages.append("{url}/{page}".format(url=url, page=raw['href']))
        # Raw string avoids the invalid '\.' escape of the original.
        regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        for urls in pages:
            try:
                request = self.session.get(urls, proxies=self.proxies,
                                           timeout=1000)
                soup = BeautifulSoup(request.content, features="lxml")
                # renamed from `next`, which shadowed the builtin.
                next_pages = []
                for paginator in soup.find('ul', {'id': 'paginator'}).findAll('a'):
                    next_pages.append("{url}/{page}".format(
                        url=url, page=paginator['href'].replace('..', '')))
                for nextpage in next_pages:
                    self.logger.info(
                        ' Realizando scraping em {url}'.format(url=nextpage))
                    try:
                        request = self.session.get(nextpage,
                                                   proxies=self.proxies,
                                                   timeout=1000)
                        soup = BeautifulSoup(request.content, features="lxml")
                        list_urls = []
                        for raw in soup.find('div', {
                                'class': 'generic-page'
                        }).findAll('footer'):
                            for get_onion in raw.findAll('a'):
                                list_urls.append(get_onion['href'])
                        for lines in list_urls:
                            rurls = lines \
                                .replace('\xad', '') \
                                .replace('\n', '') \
                                .replace("http://", '') \
                                .replace("https://", '') \
                                .replace(r'\s', '') \
                                .replace('\t', '')
                            xurl = regex.match(rurls)
                            if xurl is not None:
                                self.database.saveonion(url=xurl.group(),
                                                        source=self.source)
                    except (requests.exceptions.ConnectionError,
                            requests.exceptions.ChunkedEncodingError,
                            requests.exceptions.ReadTimeout,
                            requests.exceptions.InvalidURL) as e:
                        self.logger.error(
                            ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                            .format(e=e))
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                    .format(e=e))
class DiscoverDarkWebService:
    """Scrapes the 'Discover Dark Web' hidden service for .onion addresses."""

    def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
        """Store DB credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'Discover Dark Web Hidden Service'
        logging.basicConfig(level=logging.INFO)
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:DiscoverDarkWebService')
        self.session = requests.session()
        # Tor SOCKS proxy; only plain-HTTP traffic is proxied.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }

    @property
    def start(self):
        """Run the pipeline: normalize stored URLs, then scrape the service."""
        self.database.replaces()
        self.discover_dark_web()

    def discover_dark_web(self):
        """Fetch the /discover listing and persist every .onion address found."""
        url = 'http://3bbaaaccczcbdddz.onion/discover'
        self.logger.info(' Conectando em {url}'.format(url=url))
        # BUG FIX: initialize before the try — the original referenced
        # `list_urls` after `except: pass`, raising NameError when the
        # request failed.
        list_urls = []
        try:
            request = self.session.get(url, proxies=self.proxies, timeout=1000)
            soup = BeautifulSoup(request.content, features="lxml")
            for raw in soup.find('table', {'class': 'table'}).findAll('a'):
                list_urls.append(raw['href'].replace('/search?q=', ''))
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            self.logger.error(
                ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'.format(e=e))
        self.logger.info(' Aplicando REGEX. Aguarde...')
        # Raw string avoids the invalid '\.' escape of the original.
        regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        for lines in list_urls:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace(r'\s', '') \
                .replace('\t', '')
            xurl = regex.match(rurls)
            if xurl is not None:
                self.database.saveonion(
                    url=xurl.group(),
                    source=self.source)
class ExternalListAPI:
    """Imports .onion addresses from a user-supplied text file into the DB."""

    def __init__(self, file=None, host_db=None, user_db=None, password_db=None,
                 database=None):
        """Store DB credentials, open the database handle and keep the file path."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'Pastebin'
        logging.basicConfig(level=logging.INFO)
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:ExternalListAPI')
        self.file = file

    @property
    def start(self):
        """Run the pipeline: normalize stored URLs, then import the file."""
        self.database.replaces()
        self.getExternal()

    def getExternal(self):
        """Read the list file, extract .onion addresses and save the new ones."""
        self.logger.info(' Fazendo comparação da lista de URLS com o banco de dados. AGUARDE..')
        with open(self.file, 'r') as outputfile:
            self.logger.info(' Aplicando REGEX. Aguarde...')
            # Raw string avoids the invalid '\.' escape of the original.
            regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
            # PERF FIX: the source row is constant for the whole file; the
            # original re-queried it for every matching line.
            compare_source = self.database.compare_source(source=self.source)
            for lines in outputfile.readlines():
                rurls = lines \
                    .replace('\xad', '') \
                    .replace('\n', '') \
                    .replace("http://", '') \
                    .replace("https://", '') \
                    .replace(r'\s', '') \
                    .replace('\t', '')
                xurl = regex.match(rurls)
                if xurl is not None:
                    compare_url = self.database.compare_url(url=xurl.group())
                    if compare_url:
                        self.logger.debug(' A url {url} já existe no banco de dados.'.format(url=xurl.group()))
                    else:
                        self.database.save_url(url=xurl.group(), source=compare_source[0][0])
class GoogleAPI:
    """Uses the Google Custom Search API to find pages mentioning .onion hosts."""

    def __init__(
            self,
            host_db=None,
            user_db=None,
            password_db=None,
            database=None,
            api_key=None,
            cse_id=None,
    ):
        """Store credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.api_key = api_key
        self.cse_id = cse_id
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.logger = logging.getLogger('Class:GoogleAPI')
        self.source = 'Google'
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.session = requests.session()
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
        ]

    @property
    def random_headers(self):
        """Return request headers with a user agent picked at random."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        """Run the search-and-extract pipeline."""
        self.urls()

    def google_search(self, search_term, **kwargs):
        """Execute one Custom Search query; return the response dict or None."""
        service = build("customsearch",
                        "v1",
                        developerKey=self.api_key,
                        cache_discovery=False)
        try:
            res = service.cse().list(q=search_term, cx=self.cse_id,
                                     **kwargs).execute()
            # FIX: dropped the original's unused second `.execute()` call
            # (`next_response`) — it burned API quota for a discarded result.
            return res
        except Exception:
            # googleapiclient's HttpError is not importable here; keep the
            # broad catch but avoid a bare `except:`.
            return None

    def text(self, url=None):
        """Download `url` and return its visible text, or None on failure."""
        if url is not None:
            try:
                request_pages = self.session.get('{}'.format(url),
                                                 headers=self.random_headers,
                                                 timeout=500)
                if request_pages.status_code == 200:
                    soup = BeautifulSoup(request_pages.content,
                                         features="lxml")
                    # Strip script/style noise before extracting text.
                    for s in soup(['script', 'style']):
                        s.decompose()
                    return ' '.join(soup.stripped_strings)
            except (requests.exceptions.MissingSchema,
                    requests.exceptions.ConnectionError):
                return None

    def urls(self):
        """Page through the search results, scrape each hit and persist .onion URLs."""
        search = self.google_search('intext:.onion', num=10, start=1)
        if search is not None:
            number_pages_search = int(
                search['queries']['request'][0]['totalResults']) // 10
            cont = 1
            while cont <= number_pages_search:
                cont += 1
                search = self.google_search('intext:.onion', num=10, start=cont)
                if search is not None:
                    for result in search:
                        if 'items' in result:
                            texts = []
                            for results in search[result]:
                                texts.append(
                                    self.text(url=results['formattedUrl']))
                            # FIX: original tested `texts is not None`, which a
                            # list never is; skip the regex pass when empty.
                            if texts:
                                # Raw string avoids the invalid '\.' escape.
                                regex = re.compile(
                                    r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
                                for lines in texts:
                                    if lines is not None:
                                        for split_lines in lines.split(r' '):
                                            replace_urls = split_lines \
                                                .replace('\xad', '') \
                                                .replace('\n', '') \
                                                .replace("http://", '') \
                                                .replace("https://", '') \
                                                .replace(r'\s', '') \
                                                .replace('\t', '')
                                            url = regex.match(replace_urls)
                                            if url is not None:
                                                self.database.saveonion(
                                                    url=url.group(),
                                                    source=self.source)
class TORCH:
    """Queries the TORCH hidden-service search engine for .onion addresses."""

    def __init__(self, host_db=None, user_db=None, password_db=None, database=None):
        """Store DB credentials, open the database handle and register this source."""
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'TORCH'
        logging.basicConfig(level=logging.INFO)
        # Register the source only once (fixes the 'compare_sorce' typo and
        # the empty `if: pass / else:` anti-pattern).
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:TORCH')
        self.session = requests.session()
        # Tor SOCKS proxy; only plain-HTTP traffic is proxied.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }
        self.url = 'http://xmh57jrzrnw6insl.onion'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
        ]

    @property
    def random_headers(self):
        """Return request headers with a user agent picked at random."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        """Run the search pipeline."""
        self.pages()

    def pages(self):
        """Search TORCH for every keyword (10 result pages each) and persist hits."""
        keywords = [
            'forum', 'press', 'search', 'introduction', 'arabic', 'chinese',
            'french', 'german', 'italian', 'japanese', 'polish', 'portuguese',
            'russians', 'Spanish', 'hardcore', 'softcore', 'erotica', 'fetish',
            'violence', 'escorts', 'p**n', 'domains', 'file', 'pastebin',
            'proxies', 'web', 'blog', 'books', 'bbs', 'chans', 'wiki',
            'social', 'Social', 'activism', 'paranormal', 'politics',
            'religion', 'whistleblowing', 'development', 'c++', 'c#',
            'python', 'HTML', 'ruby', 'jupyter', 'java', 'javascript', 'java',
            'hacker', 'blackbox', 'read', 'blackhat', 'cracked', 'wordlist',
            'word', 'hacked', 'blueteam', 'Phishing', 'Malware', 'Lamer',
            'Cracker', 'Defacer', 'Spyware', 'Scammers', 'DDOS', 'SQL',
            'sql', 'Botnet', 'Exploit', 'Script', 'zero', '0day', 'zeroday',
            'Cybersecurity', 'Cyber', 'Hacktivism', 'Hacktivist', 'Keylogger',
            'Blacklist', 'ai', 'bitcoin', 'Equifax', 'Nessus', 'openvas',
            'securitycenter', 'Truecrypt', 'ClamAV', 'OSSEC', 'paloalto',
            'BackTrack', 'OSSIM', 'IPCop', 'Okta', 'sonicwall', 'pfsense',
            'Metasploit', 'OpenSSH', 'Wireshark', 'NStealth', 'drugs',
            'drug-shop', 'Acid', 'Asteroid', 'Berry', 'Poker', 'games',
            'Multiplayer', 'Play', 'activism', 'Casino', '.mp3', '.mp4',
            'Video', 'Filme', 'Movie', 'channel', 'message', 'conclusion',
            'termination', 'heading', 'headline', 'english', 'mandarin',
            'hindustani', 'arabic', 'malay', 'bengali', 'sex', 'sexy',
            'sexo', 'sexual', 'LGBT', 'Abuse', 'local', 'ebook', 'ebooks',
            'social', 'christianity', 'islam', 'nonreligious', 'secular',
            'secular', 'agnostic', 'atheist', 'hinduism', 'buddhism',
            'spiritism', 'judaism', 'primal-indigenous', 'php', 'visual',
            'C++', 'delphi', 'pascal', 'cobol', 'Cyberark', 'Firewall',
            'antivirus', 'marijuana', 'weed', 'cocaine', 'heroin', 'cannabis',
            'crack', 'ecstasy', 'amphetamines', 'lsd', 'singleplayer', 'TV',
            'television', 'radio',
        ]
        self.headers = self.random_headers
        self.logger.info(' Conectando em {}'.format(self.url))
        urls = []
        self.logger.info(' Gerando URLS')
        # Build the 10 result-page URLs for every keyword.
        for term in keywords:
            cont = 0
            while cont <= 9:
                cont += 1
                url_page = "{url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&np={number}&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd" \
                    .format(
                        url=self.url,
                        number=cont,
                        term=term)
                urls.append(url_page)
        self.logger.info(
            ' Conectando nas paginas, e coletando URLS. AGUARDE...')
        # PERF FIX: compile the pattern once instead of once per page; raw
        # string avoids the invalid '\.' escape of the original.
        regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        for number_pages in urls:
            self.logger.debug(' Conectando em {}'.format(number_pages))
            try:
                request = self.session.get(number_pages,
                                           proxies=self.proxies,
                                           timeout=1000)
                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    # Result links live inside <dt><a> elements.
                    for dt in soup.findAll('dt'):
                        for dt_a in dt.findAll('a'):
                            rurls = dt_a.get_text() \
                                .replace('\xad', '') \
                                .replace('\n', '') \
                                .replace("http://", '') \
                                .replace("https://", '') \
                                .replace(r'\s', '') \
                                .replace('\t', '')
                            full_url = regex.match(rurls)
                            if full_url is not None:
                                self.database.saveonion(url=full_url.group(),
                                                        source=self.source)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                    .format(e=e))
class UnderDir:
    """Collector for the 'UnderDir' onion directory.

    Walks every category of the directory, follows its pagination, and
    persists each discovered .onion address through ``DataBase.saveonion``.
    All HTTP traffic goes through the local Tor SOCKS proxy.
    """

    def __init__(self, host_db=None, user_db=None, password_db=None,
                 database=None):
        """Store DB credentials, register the 'UnderDir' source once,
        and prepare the Tor-proxied HTTP session.

        :param host_db:     database host.
        :param user_db:     database user.
        :param password_db: database password.
        :param database:    database (schema) name.
        """
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'UnderDir'
        logging.basicConfig(level=logging.INFO)
        # Register this crawler as a source only if it is not known yet.
        compare_sorce = self.database.compare_source(source=self.source)
        if compare_sorce:
            pass
        else:
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:UnderDir')
        self.session = requests.session()
        # 'socks5h' makes Tor resolve the .onion hostnames itself.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }

    @property
    def start(self):
        # NOTE: property used purely for its side effects; returns None.
        self.database.replaces()
        self.underdir()

    def underdir(self):
        """Scrape every category page of the UnderDir index.

        For each category found on the landing page: read its pagination
        widget, visit every page, extract the hrefs of the result rows and
        save those matching the .onion regex.
        """
        url = 'http://underdj5ziov3ic7.onion'
        self.logger.info(' Conectando em {url}'.format(url=url))
        # NOTE(review): this first request is outside any try/except; a
        # connection error here propagates to the caller.
        request = self.session.get(url, proxies=self.proxies, timeout=1000)
        soup = BeautifulSoup(request.content, features="lxml")
        for raw in soup.find('div', {
                'id': 'incore'
        }).findAll('div', {'class': 'fr_m'}):
            for category in raw.findAll('a'):
                url_list = "{url}{category}".format(category=category['href'],
                                                    url=url)
                self.logger.info(
                    ' Realizando scraping em {url}'.format(url=url_list))
                request = self.session.get(url_list,
                                           proxies=self.proxies,
                                           timeout=1000)
                soup = BeautifulSoup(request.content, features='lxml')
                pages = []
                for raw in soup.find('div', {'class': 'pgn'}).findAll('a'):
                    pages.append(raw.get_text())
                # pages[-2] is assumed to be the last page number (the final
                # anchor presumably being a "next" link) — raises IndexError
                # if the pagination widget has fewer than two anchors.
                cont = 2
                urls = [url_list]
                while cont <= int(pages[-2]):
                    cont += 1
                    urls.append("{url}/pg/{number}".format(url=url_list,
                                                           number=cont - 1))
                for get in urls:
                    self.logger.info(' Conectando em {url}.'.format(url=get))
                    try:
                        request = self.session.get(get,
                                                   proxies=self.proxies,
                                                   timeout=1000)
                        if request.status_code == 200:
                            soup = BeautifulSoup(request.content,
                                                 features='lxml')
                            itens = []
                            for raw in soup.find('div', {
                                    'class': 'trr'
                            }).findAll('a'):
                                itens.append(raw['href'].replace(
                                    'http://', ''))
                            regex = re.compile(
                                "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
                            for lines in itens:
                                # Strip soft hyphens, schemes and whitespace
                                # artifacts before matching.
                                rurls = lines \
                                    .replace('\xad', '') \
                                    .replace('\n', '') \
                                    .replace("http://", '') \
                                    .replace("https://", '') \
                                    .replace(r'\s', '') \
                                    .replace('\t', '')
                                # NOTE(review): rebinds the name `urls` (the
                                # page list being iterated above). Iteration
                                # is unaffected — the for-loop holds the
                                # original list object — but the shadowing is
                                # confusing and worth renaming some day.
                                urls = regex.match(rurls)
                                if urls is not None:
                                    self.database.saveonion(
                                        url=urls.group(), source=self.source)
                    except (requests.exceptions.ConnectionError,
                            requests.exceptions.ChunkedEncodingError,
                            requests.exceptions.ReadTimeout,
                            requests.exceptions.InvalidURL) as e:
                        self.logger.error(
                            ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                            .format(e=e))
                        pass
class TorConnect:
    """Core crawler: visits stored .onion URLs through Tor, spiders their
    internal links, categorizes page text against DB term lists, takes a
    screenshot, and sends Telegram alerts for hits.

    Operating modes (selected in :meth:`selection`):
      * ``list``/``list_file`` — crawl URLs read from a text file;
      * ``order``/``number``   — crawl a slice of the stored URL table;
      * ``alone``              — crawl one URL (or a comma-separated few);
      * default                — crawl every stored URL by ID order.
    """

    def __init__(self,
                 alone=None,
                 host_db=None,
                 user_db=None,
                 password_db=None,
                 database=None,
                 telegram_chat_id=None,
                 telegram_token=None,
                 order=None,
                 number=None,
                 list=None,
                 list_file=None,
                 ignoredate=None):
        # NOTE(review): parameter `list` shadows the builtin; kept for
        # caller compatibility (see start_StartList, which passes list=True).
        self.ignoredate = ignoredate
        self.number = number
        self.order = order
        self.list = list
        self.list_file = list_file
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.telegram_chat_id = telegram_chat_id
        self.telegram_token = telegram_token
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.logger = logging.getLogger('Class:TorConnect')
        # NOTE(review): chat id/token are stored but Telegram() is built
        # without them — presumably it reads its own config; confirm.
        self.telegram = Telegram()
        self.date = datetime.now()
        self.session = requests.session()
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
        ]
        # Tor SOCKS proxy; 'socks5h' resolves .onion names inside Tor.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }
        self.alone = alone

    @property
    def headers(self):
        """HTTP headers with a user agent drawn from desktop_agents."""
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        # NOTE: property used for its side effects only; returns None.
        self.selection()

    def selection(self):
        """Dispatch to the crawl mode chosen at construction time.

        URLs seen ONLINE are re-crawled after 14 days, OFFLINE ones after
        7 days; ``ignoredate`` (list mode) bypasses the date checks.
        """
        # "now minus 7/14 days" as naive datetimes, for last_date compares.
        lastsevendays = datetime.strptime(
            time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime()),
            '%Y-%m-%dT%H:%M:%S') - timedelta(days=7)
        lastfourteendays = datetime.strptime(
            time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime()),
            '%Y-%m-%dT%H:%M:%S') - timedelta(days=14)
        if self.list is not None:
            # --- list-file mode: import URLs from a file, then crawl ---
            with open(self.list_file, 'r') as outputfile:
                self.logger.info(' Aplicando REGEX. Aguarde...')
                regex = re.compile(
                    "[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
                for lines in outputfile.readlines():
                    # Strip soft hyphens, schemes and whitespace artifacts.
                    rurls = lines \
                        .replace('\xad', '') \
                        .replace('\n', '') \
                        .replace("http://", '') \
                        .replace("https://", '') \
                        .replace(r'\s', '') \
                        .replace('\t', '')
                    xurl = regex.match(rurls)
                    if xurl is not None:
                        externalURL = xurl.group()
                        self.logger.debug(
                            ' Comparando a URL digitada com o que está no banco de dados.'
                        )
                        compare_url = self.database.compare_url(
                            url=externalURL)
                        if compare_url:
                            self.logger.debug(
                                ' A url {url} já existe no banco de dados.'.
                                format(url=externalURL))
                        else:
                            self.database.save_url(url=externalURL, source=1)
                        for id, source_id, url, status, created_in, last_date in self.database.select_alone(
                                alone=externalURL):
                            if self.ignoredate:
                                self.crawler(id=id, url=url)
                            else:
                                if status == 1:
                                    self.logger.info(
                                        ' A url {url} com o status de ONLINE já foi vista.'
                                        .format(url=url))
                                    if last_date is not None:
                                        if last_date <= lastfourteendays:
                                            self.logger.info(
                                                ' Já faz mais de duas semanas que a url {url} com o status de ONLINE foi vista pela ultima vez, uma nova verificação será feita.'
                                                .format(url=url))
                                            self.crawler(id=id, url=url)
                                    else:
                                        self.crawler(id=id, url=url)
                                elif status == 0:
                                    if last_date is not None:
                                        if last_date <= lastsevendays:
                                            self.logger.info(
                                                ' Já faz mais de duas semanas que a url {url} com o status de OFFILINE foi vista pela ultima vez, uma nova verificação será feita.'
                                                .format(url=url))
                                            self.crawler(id=id, url=url)
                                else:
                                    # status neither 1 nor 0: never visited.
                                    self.logger.info(
                                        ' A url {url} nunca foi vista, uma tentativa será feita agora..'
                                        .format(url=url))
                                    self.crawler(id=id, url=url)
        elif self.order:
            # --- ordered mode: crawl a stride/slice of the URL table ---
            # NOTE(review): [n::n] is a stride over select(), not a column
            # sort as the log message suggests — confirm intended semantics.
            self.logger.info(
                ' Você selecionou a opção para ordenar pela coluna {number}.'.
                format(number=self.number))
            for id, source_id, url, status, created_in, last_date in self.database.select(
            )[int(self.number)::int(self.number)]:
                if status == 1:
                    self.logger.info(
                        ' A url {url} já foi vista, será verificado a ultima vez que a mesma foi visitada.'
                        .format(url=url))
                    if last_date is not None:
                        if last_date <= lastfourteendays:
                            self.logger.info(
                                ' Já faz mais de duas semanas que a url {url} com o status de ONLINE foi vista pela ultima vez, uma nova verificação será feita.'
                                .format(url=url))
                            self.crawler(id=id, url=url)
                    else:
                        self.crawler(id=id, url=url)
                elif status == 0:
                    if last_date is not None:
                        if last_date <= lastsevendays:
                            self.logger.info(
                                ' Já faz mais de duas semanas que a url {url} com o status de OFFILINE foi vista pela ultima vez, uma nova verificação será feita.'
                                .format(url=url))
                            self.crawler(id=id, url=url)
                else:
                    self.logger.info(
                        ' A url {url} nunca foi vista, uma tentativa será feita agora..'
                        .format(url=url))
                    self.crawler(id=id, url=url)
        elif self.alone is not None:
            # --- single-URL mode (accepts a comma-separated list too) ---
            self.logger.info(' Você selecionou o CRAWLER para apenas uma URL.')
            if '.onion' in self.alone:
                if len(self.alone.split(',')) == 1:
                    self.logger.debug(
                        ' Comparando a URL digitada com o que está no banco de dados.'
                    )
                    compare_url = self.database.compare_url(url=self.alone)
                    if compare_url:
                        self.logger.debug(
                            ' A url {url} já existe no banco de dados.'.format(
                                url=self.alone))
                    else:
                        self.database.save_url(url=self.alone, source=1)
                    for id, source_id, url, status, created_in, last_date in self.database.select_alone(
                            alone=self.alone):
                        self.crawler(id=id, url=url)
                else:
                    self.logger.info(
                        ' Parece que você colocou mais de uma URL, será realizado CRAWLER, uma de cada vez.'
                    )
                    for alones in self.alone.split(','):
                        self.logger.debug(
                            ' Comparando a URL digitada com o que está no banco de dados.'
                        )
                        compare_url = self.database.compare_url(url=alones)
                        if compare_url:
                            self.logger.debug(
                                ' A url {url} já existe no banco de dados.'.
                                format(url=alones))
                        else:
                            self.database.save_url(url=alones, source=1)
                        for id, source_id, url, status, created_in, last_date in self.database.select_alone(
                                alone=alones):
                            self.crawler(id=id, url=url)
            else:
                self.logger.error(
                    ' OPSS... Isso que você digitou não é uma url da rede TOR.\n\n\n'
                )
        else:
            # --- default mode: every stored URL, in ID order ---
            self.logger.info(
                ' Você selecionou o Crawçer padrão, seguindo pela ordem do ID.'
            )
            for id, source_id, url, status, created_in, last_date in self.database.select(
            ):
                # NOTE(review): unlike the other modes, last_date is compared
                # without a None guard here — a NULL last_date would raise.
                if status == 1:
                    if last_date <= lastfourteendays:
                        self.logger.info(
                            ' Já faz mais de duas semanas que a url {url} com o status de ONLINE foi vista pela ultima vez, uma nova verificação será feita.'
                            .format(url=url))
                        self.crawler(id=id, url=url)
                elif status == 0:
                    if last_date <= lastfourteendays:
                        self.logger.info(
                            ' Já faz mais de duas semanas que a url {url} com o status de OFFILINE foi vista pela ultima vez, uma nova verificação será feita.'
                            .format(url=url))
                        self.crawler(id=id, url=url)
                else:
                    self.logger.info(
                        ' A url {url} nunca foi vista, uma tentativa será feita agora..'
                        .format(url=url))
                    self.crawler(id=id, url=url)

    def moreurls(self, url=None, default=None):
        """Fetch ``url`` and return the internal links found on it.

        Absolute http(s) .onion links are kept only when they contain
        ``url``; scheme-less hrefs (no space/quote/paren) are joined onto
        ``default`` (if given) or ``url``. Returns a list of URL strings on
        HTTP 200, otherwise None (implicitly) after logging.
        """
        self.logger.info(' Conectando em {}. Aguarde...'.format(url))
        fullmoreurl = None
        if url is not None:
            replace_url = url.replace('http://',
                                      '').replace('\n',
                                                  '').replace('\s', '')
            try:
                request = self.session.get('http://{}'.format(replace_url),
                                           proxies=self.proxies,
                                           headers=self.headers,
                                           timeout=500)
                if request.status_code == 200:
                    pages = []
                    soup = BeautifulSoup(request.content, features="lxml")
                    try:
                        for raw in soup.find('body').findAll():
                            mosturl = str(raw.get('href'))
                            if raw.get('href') is not None:
                                if 'http://' in mosturl:
                                    if '.onion' in mosturl:
                                        if url in mosturl:
                                            fullmoreurl = mosturl.replace(
                                                'http://', '')
                                elif 'https://' in mosturl:
                                    if '.onion' in mosturl:
                                        if url in mosturl:
                                            fullmoreurl = mosturl.replace(
                                                'https://', '')
                                else:
                                    # Relative href: skip anything that looks
                                    # like inline JS or malformed markup.
                                    if ' ' in mosturl:
                                        pass
                                    elif "'" in mosturl:
                                        pass
                                    elif '"' in mosturl:
                                        pass
                                    elif '(' in mosturl:
                                        pass
                                    else:
                                        # NOTE(review): both branches of the
                                        # '..' test do the same thing; the
                                        # distinction appears vestigial.
                                        if '..' in mosturl:
                                            if default is not None:
                                                fullmoreurl = '{0}/{1}'.format(default, mosturl) \
                                                    .replace('//', '/')
                                            else:
                                                fullmoreurl = '{0}/{1}'.format(url, mosturl) \
                                                    .replace('//', '/')
                                        else:
                                            if default is not None:
                                                fullmoreurl = '{0}/{1}'.format(default, mosturl) \
                                                    .replace('//', '/')
                                            else:
                                                fullmoreurl = '{0}/{1}'.format(url, mosturl) \
                                                    .replace('//', '/')
                                # fullmoreurl keeps its previous value when a
                                # link is filtered out, so duplicates can be
                                # appended here.
                                if fullmoreurl is not None:
                                    pages.append(fullmoreurl)
                    except AttributeError as e:
                        self.logger.error(
                            ' OPSS... Parece que não tem texto nenhum nessa página.\n{e}'
                            .format(e=e))
                        pass
                    return pages
                else:
                    self.logger.error(' Não consegui conectar na url')
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                    .format(e=e))

    def crawler_text(self, url=None):
        """Return the visible text of ``url`` (scripts/styles removed),
        or None (implicitly) on non-200 status or request errors."""
        try:
            if url is not None:
                request_pages = self.session.get('http://{}'.format(url),
                                                 proxies=self.proxies,
                                                 headers=self.headers,
                                                 timeout=500)
                self.logger.debug(' Conectando em {url} - {status}'.format(
                    url=url, status=request_pages.status_code))
                if request_pages.status_code == 200:
                    soup = BeautifulSoup(request_pages.content,
                                         features="lxml")
                    #text = soup_page.findAll(text=True)
                    for s in soup(['script', 'style']):
                        s.decompose()
                    return ' '.join(soup.stripped_strings)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects) as e:
            # Best-effort text fetch: errors are deliberately swallowed.
            pass

    # NOTE(review): nested functions resolve names at call time in module/
    # builtin scope, not the class body — so the raises/except below actually
    # use the *builtin* TimeoutError, and this nested class is shadowed.
    class TimeoutError(Exception):
        pass

    def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
        """Decorator: abort the wrapped call with TimeoutError after
        ``seconds`` via SIGALRM (Unix-only)."""
        def decorator(func):
            def _handle_timeout(signum, frame):
                raise TimeoutError(error_message)

            def wrapper(*args, **kwargs):
                signal.signal(signal.SIGALRM, _handle_timeout)
                signal.alarm(seconds)
                try:
                    result = func(*args, **kwargs)
                finally:
                    # Always cancel the pending alarm.
                    signal.alarm(0)
                return result

            return wraps(func)(wrapper)

        return decorator

    @timeout(30)
    def screenshot(self, namescreenshot=None, url=None):
        """Screenshot ``url`` with headless Chrome through the Tor proxy,
        writing VigilantOnion/media/sites/cover/<namescreenshot>.png."""
        try:
            os.system(
                "google-chrome --headless --no-sandbox --disable-gpu --proxy-server=socks://127.0.0.1:9050 --screenshot=VigilantOnion/media/sites/cover/{namescreenshot}.png http://{url}"
                .format(namescreenshot=namescreenshot, url=url))
        except TimeoutError:
            self.logger.error(
                ' Não foi possível realizar o screenshot da url {url}.'.format(
                    url=url))

    def crawler(self, id, url):
        """Crawl one URL: screenshot it, spider one level of internal links
        (and one sub-level), accumulate page text in /tmp, classify it
        against DB category/keyword terms, then persist status and alert
        via Telegram. Failures mark the URL offline (status 0).
        """
        text = []   # NOTE(review): never used below.
        key = None
        type = None  # shadows builtin `type`; holds matched category id
        status = 0   # NOTE(review): never used below.
        self.logger.info(
            ' Iniciando o Crawler na url {url}...'.format(url=url))
        if url is not None:
            self.startscritp = time.time()
            # Screenshot filename: the URL with dots stripped.
            namescreenshot = url.replace('.', '')
            self.logger.debug(
                ' Tentando conexão na url {url}, para os proximos passos.'.
                format(url=url))
            try:
                request_pages = self.session.get('http://{}'.format(url),
                                                 proxies=self.proxies,
                                                 headers=self.headers,
                                                 timeout=100)
                if request_pages.status_code == 200:
                    self.logger.info(
                        ' Fazendo uma screenshot da pagina inicial da url {url}.'
                        .format(url=url))
                    self.screenshot(namescreenshot=namescreenshot, url=url)
                    # NOTE(review): moreurls() is called twice (check + loop),
                    # doubling the network fetch of the landing page.
                    if self.moreurls(url=url) is not None:
                        for pages in self.moreurls(url=url):
                            url_pages = pages \
                                .replace('\xad', '') \
                                .replace('\n', '') \
                                .replace("http://", '') \
                                .replace("https://", '') \
                                .replace(r'\s', '') \
                                .replace('\t', '') \
                                .replace('//', '/')
                            try:
                                request_pages_more = self.session.get(
                                    'http://{}'.format(url_pages),
                                    proxies=self.proxies,
                                    headers=self.headers,
                                    timeout=100)
                                if request_pages_more.status_code == 200:
                                    # Save the internal page and link it to
                                    # the parent URL id (both idempotent).
                                    if self.database.compare_more(
                                            url=url_pages):
                                        self.logger.debug(
                                            ' A url {url} já está no banco de dados.'
                                            .format(url=url_pages))
                                    else:
                                        self.database.save_more(url=url_pages,
                                                                status=1)
                                    if self.database.check_url_more_id(
                                            url_id=id,
                                            more_id=self.database.
                                            return_id_more(
                                                url=url_pages)[0][0]):
                                        self.logger.debug(
                                            ' A url {url} já está no banco de dados.'
                                            .format(url=url_pages))
                                    else:
                                        self.database.save_url_more_id(
                                            url_id=id,
                                            more_id=self.database.
                                            return_id_more(
                                                url=url_pages)[0][0])
                                    # Second spider level: links found on the
                                    # internal page, resolved against the
                                    # original host (default=url).
                                    check_sub_pages = self.moreurls(
                                        url=url_pages, default=url)
                                    if check_sub_pages is not None:
                                        for sub_pages in check_sub_pages:
                                            url_pages_more = sub_pages \
                                                .replace('\xad', '') \
                                                .replace('\n', '') \
                                                .replace("http://", '') \
                                                .replace("https://", '') \
                                                .replace(r'\s', '') \
                                                .replace('\t', '') \
                                                .replace('//', '/')
                                            try:
                                                request_pages_more = self.session.get(
                                                    'http://{}'.format(
                                                        url_pages_more),
                                                    proxies=self.proxies,
                                                    headers=self.headers,
                                                    timeout=100)
                                                if request_pages_more.status_code == 200:
                                                    if self.database.compare_more(
                                                            url=url_pages_more
                                                    ):
                                                        self.logger.debug(
                                                            ' A url {url} já está no banco de dados.'
                                                            .format(
                                                                url=
                                                                url_pages_more)
                                                        )
                                                    else:
                                                        self.database.save_more(
                                                            url=url_pages_more,
                                                            status=1)
                                                    if self.database.check_url_more_id(
                                                            url_id=id,
                                                            more_id=self.
                                                            database.
                                                            return_id_more(
                                                                url=
                                                                url_pages_more
                                                            )[0][0]):
                                                        self.logger.debug(
                                                            ' A url {url} já está no banco de dados.'
                                                            .format(
                                                                url=
                                                                url_pages_more
                                                            ))
                                                    else:
                                                        self.database.save_url_more_id(
                                                            url_id=id,
                                                            more_id=self.
                                                            database.
                                                            return_id_more(
                                                                url=
                                                                url_pages_more
                                                            )[0][0])
                                                else:
                                                    self.logger.error(
                                                        ' Por Algum motivo, não consegui conectar na URL {url}'
                                                        .format(
                                                            url=url_pages_more
                                                        ))
                                            except (requests.exceptions.
                                                    ConnectionError,
                                                    requests.exceptions.
                                                    ChunkedEncodingError,
                                                    requests.exceptions.
                                                    ReadTimeout,
                                                    requests.exceptions.
                                                    InvalidURL) as e:
                                                self.logger.error(
                                                    ' Um erro ocorreu.\n\n{error}\n.'
                                                    .format(error=e))
                                                pass
                            except (requests.exceptions.ConnectionError,
                                    requests.exceptions.ChunkedEncodingError,
                                    requests.exceptions.ReadTimeout,
                                    requests.exceptions.InvalidURL) as e:
                                self.logger.error(
                                    ' Um erro ocorreu.\n\n{error}\n'.format(
                                        error=e))
                                pass
                    self.logger.info(
                        ' Obtendo todas as informações, das páginas salvas.')
                    if self.database.return_id_urls_more(id=id):
                        # Accumulate the text of every saved internal page in
                        # one per-site /tmp log ('onion'->'lonion' rename).
                        for id_pages in self.database.return_id_urls_more(
                                id=id):
                            pagination = self.database.return_url_more(
                                id=id_pages[0])
                            filelog = "/tmp/{}".format(
                                url.replace('onion', 'lonion'))
                            if not os.path.exists(filelog):
                                arquivo = open(filelog, 'w', encoding="utf8")
                                arquivo.close()
                            text_crawler = str(
                                self.crawler_text(url=pagination[0][0]))
                            # NOTE(review): str() makes this always non-None
                            # (a failed fetch appends the string "None").
                            if text_crawler is not None:
                                arquivo = open(filelog, 'r', encoding="utf8")
                                conteudo = arquivo.readlines()
                                conteudo.append(text_crawler)
                                arquivo = open(filelog, 'w', encoding="utf8")
                                arquivo.writelines(conteudo)
                                arquivo.close()
                        counter_category = collections.Counter()
                        with open(filelog, 'r') as a:
                            self.logger.info(' Definindo a categoria do site.')
                            for linha in a:
                                linha = linha.split('\n')
                                for id_categorie, term in self.database.return_categorie_term(
                                ):
                                    if term.lower() in str(linha).lower():
                                        type = id_categorie
                                        counter_category[term] += 1
                            self.logger.info(' Procurando por keywords.')
                            # NOTE(review): the file iterator is already
                            # exhausted by the loop above, so this keyword
                            # pass reads nothing — confirm intent.
                            for linha in a:
                                linha = linha.split('\n')
                                for id_keyword, company, term in self.database.return_keyword(
                                ):
                                    if term.lower() in str(linha).lower():
                                        key = id_keyword
                                        self.database.save_search_keyword(
                                            url_id=id, company_id=company)
                                        break
                        if key is not None:
                            fim = time.time()
                            self.database.save_categorie(id=id,
                                                         status=1,
                                                         type=type)
                            alert = "KEYWORD:\n\nNew keyword:\nSite:{url}\nStatus: 200\nkeyword:{key}\nTime:{time}\n".format(
                                url=url,
                                key=key,
                                time=int(fim - self.startscritp),
                            )
                            self.telegram.send(alert)
                        if type is not None:
                            # Categorize by the most frequent matched term.
                            fim = time.time()
                            cover = "sites/cover/{namescreenshot}.png".format(
                                namescreenshot=namescreenshot)
                            name_type = max(counter_category.most_common())
                            save_type = self.database.return_keyword_id(
                                term=name_type[0])[0][0]
                            self.database.save_categorie(id=id,
                                                         status=1,
                                                         type=int(save_type),
                                                         cover=cover)
                            alert = "New site with 200 return found:\nSite:{url}\nStatus: 200\nType:{type}\nTime:{time}\n".format(
                                url=url,
                                type=int(save_type),
                                time=int(fim - self.startscritp),
                            )
                            self.telegram.send(alert)
                            os.remove(filelog)
                        else:
                            # No category matched: save as type 1 ("Other").
                            fim = time.time()
                            cover = "sites/cover/{namescreenshot}.png".format(
                                namescreenshot=namescreenshot)
                            self.database.save_categorie(id=id,
                                                         status=1,
                                                         type=1,
                                                         cover=cover)
                            alert = "New site with 200 return found:\nSite:{url}\nStatus: 200\nType:{type}\nTime:{time}\n".format(
                                url=url,
                                type=1,
                                time=int(fim - self.startscritp),
                            )
                            self.telegram.send(alert)
                            os.remove(filelog)
                    else:
                        # No internal pages saved: record the site as online,
                        # type 1, with just the screenshot cover.
                        fim = time.time()
                        cover = "sites/cover/{namescreenshot}.png".format(
                            namescreenshot=namescreenshot)
                        self.database.save_categorie(id=id,
                                                     status=1,
                                                     type=1,
                                                     cover=cover)
                        alert = "New site with 200 return found:\nSite:{url}\nStatus: 200\nType:{type}\nTime:{time}\n".format(
                            url=url,
                            type=1,
                            time=int(fim - self.startscritp),
                        )
                        self.telegram.send(alert)
                else:
                    self.logger.error(
                        ' Por Algum motivo, não consegui conectar na URL, vou salvar como offline, para uma nova tentativa ser realizada, no proximo loop.'
                    )
                    self.database.save_categorie_404(id=id, status=0)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    ' Um erro ocorreu.\n\n{error}\nPor conta desse erro vou salvar no banco de dados como offline.'
                    .format(error=e))
                self.database.save_categorie_404(id=id, status=0)
        else:
            self.logger.debug(
                ' Alguma URL entrou como None, melhor dar uma olhada no banco de dados.\n Talvez executar a limpeza funcione.'
            )
class FleshOnionsAPI:
    """Collector for the 'FlashOnions' hidden-service search index.

    Walks the index's paginated domain list and then runs a fixed keyword
    list through its search endpoint (all traffic via the local Tor SOCKS
    proxy), saving every address that matches the .onion regex through
    ``DataBase.saveonion``.
    """

    def __init__(self, host_db=None, user_db=None, password_db=None,
                 database=None):
        """Store DB credentials, register the 'FlashOnions' source once,
        and prepare the Tor-proxied HTTP session.

        :param host_db:     database host.
        :param user_db:     database user.
        :param password_db: database password.
        :param database:    database (schema) name.
        """
        self.host_db = host_db
        self.user_db = user_db
        self.password_db = password_db
        self.database_name = database
        self.database = DataBase(
            host_db=self.host_db,
            user_db=self.user_db,
            password_db=self.password_db,
            database=self.database_name,
        )
        self.source = 'FlashOnions'
        logging.basicConfig(level=logging.INFO)
        # Register this crawler as a source only if it is not known yet.
        if not self.database.compare_source(source=self.source):
            self.database.save_source(source=self.source)
        self.logger = logging.getLogger('Class:FlashOnions')
        self.session = requests.session()
        # 'socks5h' makes Tor resolve the .onion hostnames itself.
        self.proxies = {
            'http': 'socks5h://localhost:9050',
        }

    @property
    def start(self):
        """Entry point: normalize stored URLs, then scrape the index."""
        self.database.replaces()
        self.flash_onion()

    def _collect_domains(self, soup, onions):
        """Append every absolute http:// link of the result table to
        ``onions``; no-op when the page has no domain_list table."""
        table = soup.find('table', {'class': 'domain_list'})
        if table is None:
            return
        for raw in table.findAll('a'):
            if 'http://' in raw['href']:
                onions.append(raw['href'])

    def flash_onion(self):
        """Scrape the FlashOnions index and persist every .onion found."""
        url = 'http://vps7nsnlz3n4ckiie5evi5oz2znes7p57gmrvundbmgat22luzd4z2id.onion/'
        self.logger.info(' Conectando em {url}'.format(url=url))
        try:
            request = self.session.get(url, proxies=self.proxies,
                                       timeout=1000)
            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                # Read the page numbers from the pagination widget; fall
                # back to a single page when it is absent (original code
                # raised IndexError on pages[-1] in that case).
                pages = []
                pagination = soup.find('div', {'class': 'pagination'})
                if pagination is not None:
                    for number_pages in pagination.findAll('a'):
                        pages.append(number_pages.get_text())
                last_page = int(pages[-1]) if pages else 1
                urls = []
                cont = 0
                while cont <= last_page:
                    cont += 1
                    urls.append(
                        "{url}?search_title_only=on&search=&rep=n%2Fa&page={number}"
                        .format(number=cont - 1, url=url))
                onions = []
                for connect in urls:
                    time.sleep(4)  # throttle: be gentle with the service
                    self.logger.info(
                        ' Conectando em {url}'.format(url=connect))
                    # BUG FIX: the original requested the base `url` here,
                    # so every iteration re-fetched the landing page and the
                    # built page URL (`connect`) was never used.
                    request = self.session.get(connect,
                                               proxies=self.proxies,
                                               timeout=1000)
                    if request.status_code == 200:
                        soup = BeautifulSoup(request.content,
                                             features="lxml")
                        self._collect_domains(soup, onions)
                keywords = [
                    'forum', 'press', 'search', 'introduction', 'arabic',
                    'chinese', 'french', 'german', 'italian', 'japanese',
                    'polish', 'portuguese', 'russians', 'Spanish',
                    'hardcore', 'softcore', 'erotica', 'fetish', 'violence',
                    'escorts', 'p**n', 'domains', 'file', 'pastebin',
                    'proxies', 'web', 'blog', 'books', 'bbs', 'chans',
                    'wiki', 'social', 'Social', 'activism', 'paranormal',
                    'politics', 'religion', 'whistleblowing', 'development',
                    'c++', 'c#', 'python', 'HTML', 'ruby', 'jupyter',
                    'java', 'javascript', 'java', 'hacker', 'blackbox',
                    'read', 'blackhat', 'cracked', 'wordlist', 'word',
                    'hacked', 'blueteam', 'Phishing', 'Malware', 'Lamer',
                    'Cracker', 'Defacer', 'Spyware', 'Scammers', 'DDOS',
                    'SQL', 'sql', 'Botnet', 'Exploit', 'Script', 'zero',
                    '0day', 'zeroday', 'Cybersecurity', 'Cyber',
                    'Hacktivism', 'Hacktivist', 'Keylogger', 'Blacklist',
                    'ai', 'bitcoin', 'Equifax', 'Nessus', 'openvas',
                    'securitycenter', 'Truecrypt', 'ClamAV', 'OSSEC',
                    'paloalto', 'BackTrack', 'OSSIM', 'IPCop', 'Okta',
                    'sonicwall', 'pfsense', 'Metasploit', 'OpenSSH',
                    'Wireshark', 'NStealth', 'drugs', 'drug-shop', 'Acid',
                    'Asteroid', 'Berry', 'Poker', 'games', 'Multiplayer',
                    'Play', 'activism', 'Casino', '.mp3', '.mp4', 'Video',
                    'Filme', 'Movie', 'channel', 'message', 'conclusion',
                    'termination', 'heading', 'headline', 'english',
                    'mandarin', 'hindustani', 'arabic', 'malay', 'bengali',
                    'sex', 'sexy', 'sexo', 'sexual', 'LGBT', 'Abuse',
                    'local', 'ebook', 'ebooks', 'social', 'christianity',
                    'islam', 'nonreligious', 'secular', 'secular',
                    'agnostic', 'atheist', 'hinduism', 'buddhism',
                    'spiritism', 'judaism', 'primal-indigenous', 'php',
                    'visual', 'C++', 'delphi', 'pascal', 'cobol',
                    'Cyberark', 'Firewall', 'antivirus', 'marijuana',
                    'weed', 'cocaine', 'heroin', 'cannabis', 'crack',
                    'ecstasy', 'amphetamines', 'lsd', 'singleplayer', 'TV',
                    'television', 'radio',
                ]
                for term in keywords:
                    time.sleep(2)
                    # BUG FIX: `query` was computed but unused and the raw
                    # term was interpolated; URL-encode the term as the
                    # quote() call evidently intended.
                    query = urllib.parse.quote(term)
                    search = "{url}/?rep=n%2Fa&search={term}&submit=Go+%3E%3E%3E".format(
                        url=url, term=query)
                    self.logger.info(
                        ' Conectando em {url}'.format(url=search))
                    # BUG FIX: the original requested the base `url` instead
                    # of `search`, so every keyword query hit the same
                    # landing page.
                    request = self.session.get(search,
                                               proxies=self.proxies,
                                               timeout=1000)
                    if request.status_code == 200:
                        soup = BeautifulSoup(request.content,
                                             features="lxml")
                        self._collect_domains(soup, onions)
                self.logger.info(' Aplicando REGEX. Aguarde...')
                regex = re.compile(
                    r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
                for lines in onions:
                    # Strip soft hyphens, schemes and whitespace artifacts
                    # before matching.
                    rurls = lines \
                        .replace('\xad', '') \
                        .replace('\n', '') \
                        .replace("http://", '') \
                        .replace("https://", '') \
                        .replace(r'\s', '') \
                        .replace('\t', '')
                    xurl = regex.match(rurls)
                    if xurl is not None:
                        self.database.saveonion(url=xurl.group(),
                                                source=self.source)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            self.logger.error(
                ' Não consegui conectar na url, porque ocorreu um erro.\n{e}'
                .format(e=e))
def __init__(self, host_db=None, user_db=None, password_db=None, database=None): self.host_db = host_db self.user_db = user_db self.password_db = password_db self.database_name = database self.database = DataBase( host_db=self.host_db, user_db=self.user_db, password_db=self.password_db, database=self.database_name, ) logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger('DataBase') self.categories_names = { "1": "Other", "2": "Communications", "3": "Core Sites", "4": "Other Languages", "5": "Adult", "6": "Hosting", "7": "Personal", "8": "Social", "9": "Politics and Religion", "10": "Developer", "11": "Hacking", "12": "Security", "13": "Drug", "14": "Games", "15": "Script" } self.categories_term_Communications = [ "forum", "press", "chat", ] self.categories_term_Core_Sites = [ "search", "introduction point", ] self.categories_term_Other_Languages = [ "arabic", "chinese", "french", "german", "italian", "japanese", "polish", "portuguese", "russians", "Spanish", ] self.categories_term_Adult = [ "hardcore", "softcore", "erotica", "fetish", "violence", "escorts", "p**n", ] self.categories_term_Hosting = [ "domains", "file Hosting", "pastebin", "proxies", "web hosting", ] self.categories_term_Personal = [ "blog", "books", "pages", ] self.categories_term_Social = [ "bbs", "chans", "wiki", "social network", ] self.categories_term_Politics_and_Religion = [ "activism", "law", "paranormal", "politics", "religion", "whistleblowing", ] self.categories_term_Developer = [ "development", "c++", "c#", "python", "HTML", "ruby", "jupyter notebook", "java script", "java", ] self.categories_term_Hacking = [ "hacker", "blackbox", "read team", "redhat", "blackhat", "word", "cracked", "hacked", "blueteam", "Phishing", "Malware", "Lamer", "Cracker", "Defacer", "Spyware", "Ciberpirata", "Freiro", "Scammers", "Uc", "RAT", "DDOS", "FUD", "SQL", "XSS", "Skid", "Malware", "VPS", "ANSI Bomb", "Back Door", "Bot", "Botnet", "Buffer Overflow", "Cracker", "DOX", "Exploit", "Rainbow 
Table", "Root", "Reverse Engineering", "Shell", "Script Kiddie", "Spoof", "SQL Injection", "Trojan", "worm", "zero day exploit", ] self.categories_term_Security = [ "Sniffr", "wash", "Pay", "Shield", "Private", "Strategic", "Intelligence", "Safe", "Bitcoin", "Anonymity", ] self.categories_term_Drug = [ "drugs", "drug-shop", "Acid", "Asteroid", "Berry", ] self.categories_term_Games = [ "Poker", "games", "Multiplayer", "Play Free", "Casino", ]