def test_tor_ping_check():
    """Check ``ping_check`` over Tor for one open and one closed port."""
    onion_host = 'facebookcorewwwi.onion'
    config = Ini('files/config.ini')

    with Socket(tor_network=True, ini=config) as tor_socket:
        # Port 80 is expected to be reported as open.
        http_port_open = tor_socket.ping_check(onion_host, 80)
        assert http_port_open == True

        # Port 31337 is expected to be reported as closed.
        odd_port_open = tor_socket.ping_check(onion_host, 31337)
        assert odd_port_open == False
def test_tor_http():
    """Request an onion site through Tor and expect a response with headers."""
    ini = Ini('files/config.ini')
    response = HTTP().request(
        url='https://facebookcorewwwi.onion',
        tor_network=True,
        ini=ini,
    )

    # A falsy response already fails this assert, so the header check below
    # needs no additional ``if response:`` guard.
    assert response
    assert response.headers
def test_read_ini_file():
    """Test for reading ini file and parse key."""
    config_path = os.path.join('files', 'config.ini')
    config = Ini(config_path)

    # The HEADLESS/PATH entry must equal the expected binary location.
    headless_path = config.read('HEADLESS', 'PATH')
    assert headless_path == 'files/phantomjs'

    # The ELASTICSEARCH/HOST entry only has to be truthy.
    elastic_host = config.read('ELASTICSEARCH', 'HOST')
    assert elastic_host
def run_crawler(self, url):
    """
    Scan ``url`` with a fresh crawler and save the resulting report.

    The report is saved under ``self.request.id`` only when it is not empty
    and its ``webpage.url`` equals the requested ``url``.

    :param self: object exposing ``request.id`` (presumably a bound task
        instance -- confirm against the decorator, which is not visible here)
    :param url: address to scan
    :return: None
    """
    Log.i(f"Starting crawler task for {url}")

    config = Ini(Env.read("CONFIG_FILE"))
    crawler = Crawler(ini=config)
    report = crawler.scan(url)

    # The URL comparison is only evaluated for non-empty reports.
    has_content = not report.is_empty()
    if has_content and report.webpage.url == url:
        crawler.save(self.request.id, report)

    del crawler
def test_load_crawler():
    """Scan an onion URL and check the type and webpage fields of the report."""
    onion_url = 'http://wikitjerrta4qgz4.onion'
    ini = Ini('files/config.ini')
    crawler = Crawler(ini)

    try:
        assert crawler

        report = crawler.scan(onion_url)
        # isinstance is the idiomatic type check and also accepts subclasses.
        assert isinstance(report, DynamicObject)
        assert report.webpage.url == onion_url
        assert report.webpage.domain == 'wikitjerrta4qgz4.onion'
    finally:
        # Drop the crawler reference even when an assertion fails.
        # NOTE(review): presumably this lets the crawler release what it
        # holds -- confirm against the Crawler implementation.
        del crawler
class SourceBase(object):
    """
    Base source object class format.

    ``save()`` stores every entry of ``urls`` as a ``Domain`` row and issues
    a crawler task for it; ``collect()`` is the hook subclasses override.
    """

    # URLs waiting to be saved.
    # NOTE(review): this list lives on the class, so subclasses that do not
    # rebind ``urls`` all share (and mutate) the same list object.
    urls = []
    # Configuration built once, when the class body is executed.
    ini = Ini(Env.read('CONFIG_FILE'))
    active = True  # collector status

    def collect(self):
        """
        Run user custom method.

        :return:
        """
        pass

    def save(self):
        """
        Save domain on database and request crawling.

        Each URL is removed from ``self.urls`` after it has been attempted,
        whether the attempt succeeded or raised.

        :return: None
        """
        engine = Engine.create(self.ini)
        with Session(engine=engine) as session:
            # Iterate over a snapshot: the ``finally`` clause removes the
            # current URL from ``self.urls``, and removing items from a list
            # while iterating that same list makes the loop skip entries.
            for url in list(self.urls):
                task_id = uuid4().hex
                try:
                    # add url into database
                    session.add(Domain(uuid=task_id, url=url))
                    session.commit()

                    # Reuse the row's uuid as the task id.
                    task = run_crawler.apply_async(
                        args=(url, ), task_id=task_id)
                    Log.i("Crawler issued a new task id {} at {}".format(
                        task.task_id, url))
                except Exception:
                    # ``Exception`` instead of a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    # NOTE(review): every failure is logged as a duplicate,
                    # and the session is not rolled back after a failed
                    # commit -- confirm whether Session needs an explicit
                    # rollback before the next iteration.
                    Log.d(
                        "This {} url already saved into database.".format(url))
                finally:
                    self.urls.remove(url)
from database.session import Session from database.engine import Engine from database.models import Domain from utils.config.env import Env from utils.config.ini import Ini ini = Ini(Env.read('CONFIG_FILE')) def test_create_engine(): """ Test for create a new engine :return: """ assert Engine.create(ini=ini) def test_database_session(): """ Test for connect database session :return: """ engine = Engine.create(ini=ini) with Session(engine=engine) as session: assert session def test_manage_model(): """ Test for create a new table at memory database
from utils.config.ini import Ini
from utils.network.headless import HeadlessBrowser
from utils.network.headless import InvalidURLException, InvalidHTMLException

import pytest

ini = Ini('files/config.ini')


def test_browser():
    """Test for running headless browser."""
    headless = HeadlessBrowser(ini=ini)
    headless.run(url='https://www.naver.com')

    # A truthy screenshot is the only success criterion.
    capture = headless.get_screenshot()
    assert capture

    del headless


def test_tor_browser():
    """Test for running headless browser with tor proxy."""
    headless = HeadlessBrowser(ini=ini, tor_network=True)
    headless.run(url='http://wikitjerrta4qgz4.onion')

    # NOTE(review): the screenshot is taken but not asserted in the visible
    # code -- the function may continue beyond this excerpt.
    screenshot = headless.get_screenshot()