def __init__(self, inputfile, phishtank, openphish, cleanmx, output, alivecheck, emailnotifiers):
    """Store the CLI options and wire each pipeline component to the config.

    Aborts (SystemExit via ``exit()``) when the dependency check fails;
    otherwise loads the configuration and hands it to every component
    that the selected options enable.
    """
    logging.debug("Instantiating the '%s' class" % (self.__class__.__name__))
    self._inputfile = inputfile
    self._phishtank = phishtank
    self._openphish = openphish
    self._cleanmx = cleanmx
    self._output = output
    self._alivecheck = alivecheck
    self._emailnotifiers = emailnotifiers
    # exit() raises SystemExit, so nothing below runs when the check fails.
    if not CheckDependencies.run():
        exit()
    self._cfg = ConfigReader.run()
    if not self._inputfile:
        # No local input file: pull candidates from the online feeds.
        feed_args = (self._cfg, self._phishtank, self._openphish, self._cleanmx)
        self._downloader = Downloader(*feed_args)
        self._extractor = Extractor(*feed_args)
    if self._alivecheck:
        AliveChecker.config = self._cfg
    # Only the adapters selected in `output` get configured.
    for fmt, adapter in (('json', JSONAdapter), ('csv', CSVAdapter), ('xml', XMLAdapter)):
        if fmt in self._output:
            adapter.config = self._cfg
    if self._emailnotifiers:
        EmailNotifiers.config = self._cfg
if __name__ == '__main__':
    # Resolve the URL list and the per-file output/log locations.
    source_name = 'emotioNet_1.txt'
    url_list = get_urls(os.path.join(BASE_URL_PATH, source_name))
    event_loop = asyncio.get_event_loop()
    base_name, _ext = os.path.splitext(source_name)
    target_dir = os.path.join(BASE_SAVE_PATH, base_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    log_path = os.path.join(BASE_LOGS_PATH, base_name + '.log')
    fetcher = Downloader(url_list, target_dir, log_path, headers,
                         max_tasks=150, max_tries=4, max_sem=1000,
                         conn_time=30, loop=event_loop)
    try:
        event_loop.run_until_complete(fetcher.run())
    except Exception as err:
        # Top-level script boundary: report and fall through to cleanup.
        print(err)
    finally:
        # stop() before run_forever() lets pending callbacks drain,
        # then run_forever() returns immediately and the loop is closed.
        event_loop.stop()
        event_loop.run_forever()
        event_loop.close()
from pathlib import Path
from typing import List

from download.download_configuration import DownloadConfiguration, DateRange
from download.downloader import Downloader

import concurrent.futures
import requests

# NOTE(review): `BeautifulSoup` and `DataFilter` are used below but never
# imported here -- presumably `from bs4 import BeautifulSoup` and a
# project-local DataFilter import are missing; confirm and add them.

BASE_URL = "https://bulkdata.uspto.gov/"

# Fetch and parse the bulk-data index page; "redbook" is the pattern to match:
# https://bulkdata.uspto.gov/data/patent/grant/redbook/2021
# https://bulkdata.uspto.gov/data/patent/grant/redbook/2021/I20210105.tar
page = requests.get(BASE_URL, timeout=60)  # timeout so a hung server can't stall the script
html = page.content
soup = BeautifulSoup(html, "html.parser")

# Restrict the crawl to the 2021 grant archives.
date_range = DateRange(start=2021, end=2021)
conf = DownloadConfiguration(Path("tarfiles"), date_range)

# Narrow the index page down to the per-archive download URLs.
df = DataFilter(conf)
redbook_entries = df.get_redbook_data(soup=soup)
in_range = df.filter_by_year(redbook_entries)
urls: List[str] = df.get_all_download_urls(in_range)
print(urls)

downloader = Downloader(config=conf, d_filter=df)
print(f"Number of URLS: {len(urls)}")

# Downloads are I/O-bound, so a small thread pool overlaps the network waits.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(downloader.download_tar, urls)
class Core:
    """Top-level coordinator of the phishing-data pipeline.

    Either downloads and extracts phishing data from the online feeds
    (PhishTank / OpenPhish / CleanMX) or reads candidate URLs from a local
    input file, then optionally filters for live sites, serializes the
    results (JSON/CSV/XML) and triggers e-mail notifications.
    """

    def __init__(self, inputfile, phishtank, openphish, cleanmx, output, alivecheck, emailnotifiers):
        """Store the CLI options and wire up the enabled components.

        inputfile      -- path of a newline-separated URL list; falsy means
                          "use the online feeds" instead
        phishtank / openphish / cleanmx -- per-feed enable flags
        output         -- output formats to produce ('json', 'csv', 'xml')
        alivecheck     -- when true, keep only sites that are still reachable
        emailnotifiers -- when true, send notification e-mails at the end

        Raises SystemExit when the dependency check fails.
        """
        logging.debug("Instantiating the '%s' class" % (self.__class__.__name__))
        self._inputfile = inputfile
        self._phishtank = phishtank
        self._openphish = openphish
        self._cleanmx = cleanmx
        self._output = output
        self._alivecheck = alivecheck
        self._emailnotifiers = emailnotifiers
        if not CheckDependencies.run():
            # Equivalent to the site-module exit(); explicit raise works even
            # when the interpreter runs without site builtins.
            raise SystemExit
        self._cfg = ConfigReader.run()
        if not self._inputfile:
            # No local file: fetch and parse the online feeds instead.
            self._downloader = Downloader(self._cfg, self._phishtank, self._openphish, self._cleanmx)
            self._extractor = Extractor(self._cfg, self._phishtank, self._openphish, self._cleanmx)
        if self._alivecheck:
            AliveChecker.config = self._cfg
        if 'json' in self._output:
            JSONAdapter.config = self._cfg
        if 'csv' in self._output:
            CSVAdapter.config = self._cfg
        if 'xml' in self._output:
            XMLAdapter.config = self._cfg
        if self._emailnotifiers:
            EmailNotifiers.config = self._cfg

    def _read_inputfile(self):
        """Build the phishing-website dict from the local input file.

        Each line of the file is one URL; unknown fields are filled with
        'n.d.'. Returns {'custom': [entry, ...]}, mirroring the shape the
        extractor produces.
        """
        logging.debug("Reading possible phishing website from '%s'" % (self._inputfile))
        # Text mode: the previous 'rb' mode made line.replace('\n', '') raise
        # TypeError on Python 3 (str arguments to a bytes method).
        with open(self._inputfile, 'r') as fp:
            lines = fp.readlines()
        logging.debug(lines)
        cm_elems = []
        for line in lines:
            url = line.rstrip('\r\n')  # also drop CR from Windows-edited files
            cm_elems.append({
                'domain': url,
                'url': url,
                'ip': 'n.d.',
                'brand': 'custom',
                'time': 'n.d.',
                'country': 'n.d.',
            })
        return {'custom': cm_elems}

    def run(self):
        """Execute the pipeline: gather, optionally filter, then output."""
        print("Running...")
        if not self._inputfile:
            self._downloader.run()
            ph_ws = self._extractor.run()
        else:
            ph_ws = self._read_inputfile()
        if self._alivecheck:
            ph_ws = AliveChecker.are_alive(ph_ws)
        if len(ph_ws) > 0:
            if 'json' in self._output:
                JSONAdapter.save_ph_ws(ph_ws)
            if 'csv' in self._output:
                CSVAdapter.save_ph_ws(ph_ws)
            if 'xml' in self._output:
                XMLAdapter.save_ph_ws(ph_ws)
            if self._emailnotifiers:
                EmailNotifiers.notify()
        else:
            logging.debug('Empty phishing_website list')
        print("Done...")