def test_empty_dir(self):
    """Check ``list_available`` on a directory that holds only empty files.

    Two zero-byte files are created in a fresh temporary directory, the
    directory is registered with a new ``DirManager``, and the value returned
    by ``list_available()`` must be a ``dict``.
    """
    import shutil  # local import: only needed for the temp-dir cleanup

    dirManager = DirManager()
    tmpdir = tempfile.mkdtemp()
    # Remove the scratch directory even when the assertion below fails.
    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
    for name in ("test", "other"):
        open(os.path.join(tmpdir, name), "w").close()
    dirManager.add_directory(tmpdir)
    self.assertIsInstance(dirManager.list_available(), dict)
def test_empty_dir(self):
    """Register a directory of zero-byte files and check the listing type.

    The temporary directory receives two empty files (``test`` and
    ``other``) before being added to a new ``DirManager``;
    ``list_available()`` is then expected to return a ``dict``.
    """
    import shutil  # local import: only needed for the temp-dir cleanup

    dirManager = DirManager()
    tmpdir = tempfile.mkdtemp()
    # mkdtemp() never deletes what it creates, so schedule removal up front.
    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
    for name in ("test", "other"):
        open(os.path.join(tmpdir, name), "w").close()
    dirManager.add_directory(tmpdir)
    self.assertIsInstance(dirManager.list_available(), dict)
class TestWithDir(TestSetup):
    """Fixture base whose tests run against a ``DirManager`` holding ``test_dir``."""

    def setUp(self):
        """Create ``self.dm`` and register ``test_dir`` with it."""
        self.dm = manager = DirManager()
        manager.add_directory(test_dir)

    def tearDown(self):
        """Unregister ``test_dir`` from the manager built in ``setUp``."""
        manager = self.dm
        manager.del_directory(test_dir)
def insertCandidates(self, numDownloads, CandidateName):
    """Copy the newest downloads, prepending a candidate-name column.

    The ``numDownloads`` most recently modified files in
    ``self.download_dir`` are opened as Excel workbooks, a
    ``CandidateControlledName`` column is inserted as the first column, and
    each result is written under the same file name into the
    ``insertCandidateControlled`` folder.

    Args:
        numDownloads: How many of the most recent files to process. Zero
            processes nothing.
        CandidateName: Value for the new column; a single space (``" "``)
            is recorded as ``"Independent"``.
    """
    print('Processing {} for {}'.format(numDownloads, CandidateName))
    # filenames[-0:] would select *every* file, so bail out explicitly
    # (same guard as insertColumns).
    if numDownloads == 0:
        return

    insertCandidateFolder = DirManager(['insertCandidateControlled'])
    insertCandidateFolder.createFolder()
    new_folder = insertCandidateFolder.getDirectory()

    # Oldest first, so the tail of the list is the newest downloads.
    filenames = sorted(
        [path.join(self.download_dir, f) for f in listdir(self.download_dir)],
        key=path.getmtime)

    candidateHeader = "CandidateControlledName"
    candidateValue = "Independent" if CandidateName == " " else CandidateName

    # Columns that are read as text instead of letting pandas infer a dtype.
    errordTypes = [
        'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
        'XRef_Match'
    ]
    dtypes = {datatype: str for datatype in errordTypes}

    # One shared sink for xlrd's log output, closed when the loop finishes
    # (previously a new devnull handle was opened per file and never closed).
    with open(devnull, 'w') as xlrd_log:
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            wb = xlrd.open_workbook(fullfilepathname, logfile=xlrd_log)
            data = pd.read_excel(wb, dtype=dtypes)
            data.insert(0, candidateHeader, candidateValue)
            data.to_excel(path.join(new_folder, filename), index=False)
def aggregateData(self):
    """Merge every workbook of the inserted-data folder into one CSV file.

    The first sheet of each workbook is appended, oldest file first, to
    ``data.csv`` inside the ``aggregated_data`` folder. Row 0 (the header
    row) is written only for the first workbook so it is not repeated.
    All fields are quoted (``csv.QUOTE_ALL``).
    """
    # Create new directory for storing aggregated data
    aggregateFolder = DirManager(['aggregated_data'])
    aggregateFolder.createFolder()
    new_csv_file = path.join(aggregateFolder.getDirectory(), 'data.csv')

    # insertColumns() is what normally creates this attribute; when it never
    # ran (or only ran with zero downloads) create the folder here instead of
    # failing with AttributeError.
    insertedFolder = getattr(self, 'insertCandidateFolder', None)
    if insertedFolder is None:
        insertedFolder = DirManager(['insertedData'])
        insertedFolder.createFolder()
        self.insertCandidateFolder = insertedFolder
    insertColumsFolder = insertedFolder.getDirectory()

    filenames = sorted(
        [path.join(insertColumsFolder, f) for f in listdir(insertColumsFolder)],
        key=path.getmtime)

    # newline='' lets the csv module control line endings itself; without it
    # extra blank rows appear on platforms that translate '\n'.
    with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
        new_worksheet = csv.writer(new_aggregate_csv, quoting=csv.QUOTE_ALL)

        # Loop through all workbooks (EXCEL)
        header = False
        for filename in filenames:
            sheet = xlrd.open_workbook(filename).sheet_by_index(0)

            # Only pull the header row from the first file to avoid duplicates
            first_row = 1 if header else 0
            for rownum in range(first_row, sheet.nrows):
                new_worksheet.writerow(sheet.row_values(rownum))
            header = True
def __init__(self, path, prefix, cauldronAddr, ignore=None):
    """Set up logging, the plugin registry, the directory watcher and the sender.

    Args:
        path: Directory handed to ``DirManager`` and stored as ``self.path``.
        prefix: Stored as ``self.prefix``.
        cauldronAddr: Address handed to ``CauldronSender``.
        ignore: List of regular-expression strings passed to ``DirManager``
            as ``ignore``. ``None`` (the default) selects the built-in list
            below; a fresh list is built per instance so instances never
            share one mutable default.
    """
    if ignore is None:
        ignore = [
            r'^\.', r'\.x?swp$', r'~', r'^__', r'__$', r'\.jar$', r'\.db$'
        ]
    syslog.openlog(ident="X3Coven", facility=syslog.LOG_DAEMON)
    self.path = path
    self.prefix = prefix
    self.cauldronAddr = cauldronAddr
    self.plugins = {}
    self.plugins_lock = Lock()
    # Matches "# x3.<key> = <value>" lines. Raw string: same pattern as
    # before, without the invalid "\." escape of a non-raw literal.
    self.confRE = re.compile(r'^#\s*x3\.([a-z0-9.]+)\s*=\s*(.*)\s*$')
    self.dirManager = DirManager(self.path, self, ignore=ignore)
    self.cauldron = CauldronSender(self.cauldronAddr)
def __init__(self, path, prefix, cauldronAddr, ignore=None):
    """Set up logging, the plugin registry, the directory watcher and the sender.

    Args:
        path: Directory handed to ``DirManager`` and stored as ``self.path``.
        prefix: Stored as ``self.prefix``.
        cauldronAddr: Address handed to ``CauldronSender``.
        ignore: List of regular-expression strings passed to ``DirManager``
            as ``ignore``. ``None`` (the default) selects the built-in list
            below; building it per call avoids sharing one mutable default
            list between instances.
    """
    if ignore is None:
        ignore = [
            r'^\.', r'\.x?swp$', r'~', r'^__', r'__$', r'\.jar$', r'\.db$'
        ]
    syslog.openlog(ident="X3Coven", facility=syslog.LOG_DAEMON)
    self.path = path
    self.prefix = prefix
    self.cauldronAddr = cauldronAddr
    self.plugins = {}
    # Matches "# x3.<key> = <value>" lines. Raw string: same pattern as
    # before, without the invalid "\." escape of a non-raw literal.
    self.confRE = re.compile(r'^#\s*x3\.([a-z0-9.]+)\s*=\s*(.*)\s*$')
    self.dirManager = DirManager(self.path, self, ignore=ignore)
    self.cauldron = CauldronSender(self.cauldronAddr)
def __init__(self):
    """Prepare the download folder, the website helper and a Chrome driver.

    Headless mode is opt-in through the ``HEADLESS`` environment variable.
    An unset or empty variable, or one of ``0``/``false``/``no``/``off``
    (case-insensitive), leaves the browser visible; any other value enables
    ``--headless``.
    """
    self.DEFAULT_SLEEP_TIME = 5
    self.SEARCH_FORM_ADDRESS = "https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx"

    # create data folder to store downloaded files
    self.website = SjcWebsite()
    self.new_dir = DirManager(["data"])
    self.new_dir.createFolder()
    self.download_dir = self.new_dir.getDirectory()
    self.website.preprocessing = PreProcessing(self.download_dir)

    options = webdriver.ChromeOptions()

    # Headless data-retrieval.
    # --> Per the original note: currently not working 100%, only downloads
    #     the first link on the form table.
    # Parse the flag as text: a bare truthiness test treated "0" and "false"
    # as enabled.
    headless_raw = os.environ.get('HEADLESS')
    isHeadless = (
        headless_raw is not None
        and headless_raw.strip().lower() not in ('', '0', 'false', 'no', 'off')
    )
    if isHeadless:
        options.add_argument("--headless")
        # options.add_argument("--disable-gpu")
        # options.add_argument("--window-size=1280,800")

    for argument in (
            "--ignore-certificate-errors",
            "--test_type",
            "--no-sandbox",
            "start-maximized",
            "disable-infobars",
            "--disable-extensions",
    ):
        options.add_argument(argument)

    plugs = {"enabled": False, "name": "Chrome PDF Viewer"}
    prefs = {
        "download.default_directory": self.download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": False,
        "safebrowsing.disable_download_protection": True,
        "plugins.plugins_list": [plugs],
    }
    options.add_experimental_option("prefs", prefs)
    self.driver = webdriver.Chrome(ChromeDriverManager().install(),
                                   options=options)
def insertColumns(self, numDownloads, CandidateName, ElectionDate, BallotItem):
    """Copy the newest downloads with three extra leading columns.

    Each of the last ``numDownloads`` files returned by
    ``insertColumnsHelper()`` is read as an Excel workbook and written, under
    the same file name, into the ``insertedData`` folder. Because every
    column is inserted at position 0, the resulting leading columns are
    ``Ballot Item``, ``Election Date``, ``CandidateControlledName``.

    Args:
        numDownloads: Number of most recent files to process; ``0`` returns
            immediately without creating the output folder.
        CandidateName: Candidate column value; a single space (``" "``) is
            recorded as ``"Independent"``.
        ElectionDate: Value for the ``Election Date`` column.
        BallotItem: Value for the ``Ballot Item`` column.
    """
    print('Processing {} for {}'.format(numDownloads, CandidateName))
    if numDownloads == 0:
        return

    self.insertCandidateFolder = DirManager(['insertedData'])
    self.insertCandidateFolder.createFolder()
    new_folder = self.insertCandidateFolder.getDirectory()

    filenames = self.insertColumnsHelper()

    candidateHeader = "CandidateControlledName"
    electionDateHeader = "Election Date"
    ballotItemHeader = "Ballot Item"
    candidateValue = "Independent" if CandidateName == " " else CandidateName

    # Columns that are read as text instead of letting pandas infer a dtype.
    errordTypes = [
        'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
        'XRef_Match'
    ]
    dtypes = {datatype: str for datatype in errordTypes}

    print(filenames)
    # One shared sink for xlrd's log output, closed when the loop finishes
    # (previously a new devnull handle was opened per file and never closed).
    with open(devnull, 'w') as xlrd_log:
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            print(filename)
            wb = xlrd.open_workbook(fullfilepathname, logfile=xlrd_log)
            data = pd.read_excel(wb, dtype=dtypes)
            data.insert(0, candidateHeader, candidateValue)
            data.insert(0, electionDateHeader, ElectionDate)
            data.insert(0, ballotItemHeader, BallotItem)
            data.to_excel('{}/{}'.format(new_folder, filename), index=False)
def test_available_dict(self):
    """``list_available_nested_dict`` must return something ``dict()`` accepts."""
    dirManager = DirManager()
    # dict() raises if the return value is not a mapping / iterable of pairs,
    # so the conversion itself is the real check. The local is no longer
    # called ``json``, which shadowed the stdlib module name.
    available = dict(dirManager.list_available_nested_dict())
    self.assertIsInstance(available, dict)
def test_get_directory(self):
    """A registered directory is reported by ``get_directory``.

    Checks both call forms: without arguments the result must contain the
    registered path, and with the path as argument it must equal that path.
    """
    dirManager = DirManager()
    dirManager.add_directory("./teste/")
    # assertIn / assertEqual report both operands on failure.
    self.assertIn("./teste/", dirManager.get_directory())
    self.assertEqual("./teste/", dirManager.get_directory("./teste/"))
def test_available(self):
    """A fresh ``DirManager`` returns a ``dict`` from ``list_available``."""
    dirManager = DirManager()
    self.assertIsInstance(dirManager.list_available(), dict)
def test_random_list(self):
    """Smoke test: ``random_list`` runs without raising.

    No contract for the return value is visible from this test, so the only
    thing verified is that the call completes after ``../test_files/`` has
    been registered.
    """
    dirManager = DirManager()
    dirManager.add_directory("../test_files/")
    # An exception here fails the test; the former ``assertTrue(True)`` and
    # the unused local added nothing.
    dirManager.random_list()
def test_found_entry(self):
    """``found_entry(dot)`` resolves to the expected file under ``test_dir``."""
    dirManager = DirManager()
    dirManager.add_directory(test_dir)
    ans = dirManager.found_entry(dot)
    # assertEqual shows the actual path when the lookup returns something else.
    self.assertEqual(ans, "../test_files/DOt_-_05_-_IMF.mp3")
class Coven(object):
    """Supervise plugin scripts that live in a watched directory.

    A ``DirManager`` is created with this object as its second argument; the
    ``process_File*`` methods below start, stop and restart ``Plugin``
    objects by file name. ``practice`` is the main loop that publishes plugin
    counters through a ``CauldronSender`` once per second.
    """

    def __init__(self, path, prefix, cauldronAddr, ignore=None):
        """Set up logging, the plugin registry, the watcher and the sender.

        Args:
            path: Directory containing the plugin files.
            prefix: Label prefix used by ``put`` and passed to each ``Plugin``.
            cauldronAddr: Address handed to ``CauldronSender`` and ``Plugin``.
            ignore: List of regular-expression strings passed to
                ``DirManager`` as ``ignore``. ``None`` selects the built-in
                list; it is built per instance so no mutable default is
                shared.
        """
        if ignore is None:
            ignore = [
                r'^\.', r'\.x?swp$', r'~', r'^__', r'__$', r'\.jar$', r'\.db$'
            ]
        syslog.openlog(ident="X3Coven", facility=syslog.LOG_DAEMON)
        self.path = path
        self.prefix = prefix
        self.cauldronAddr = cauldronAddr
        self.plugins = {}
        self.plugins_lock = Lock()
        # Matches "# x3.<key> = <value>" lines. Raw string: same pattern as
        # before, without the invalid "\." escape of a non-raw literal.
        self.confRE = re.compile(r'^#\s*x3\.([a-z0-9.]+)\s*=\s*(.*)\s*$')
        self.dirManager = DirManager(self.path, self, ignore=ignore)
        self.cauldron = CauldronSender(self.cauldronAddr)

    def getOSPath(self, name):
        """Return ``name`` joined onto the plugin directory."""
        return path.join(self.path, name)

    def isShebangExecutable(self, path):
        """Return True when the file at ``path`` starts with ``#!``.

        Unreadable or missing files yield False.
        """
        try:
            with open(path, 'rb') as plugin_file:
                return plugin_file.read(2) == b'#!'
        except (IOError, OSError):
            return False

    def getConfig(self, name):
        """Collect the ``# x3.<key> = <value>`` settings of plugin ``name``.

        Returns:
            A dict mapping each key to its value string, or ``None`` when the
            file has no shebang or cannot be read.
        """
        pluginPath = self.getOSPath(name)
        if not self.isShebangExecutable(pluginPath):
            return None
        conf = {}
        try:
            with open(pluginPath, 'r') as plugin_file:
                for line in plugin_file:
                    config_match = self.confRE.match(line)
                    if config_match:
                        conf[config_match.group(1)] = config_match.group(2)
        except (IOError, OSError, UnicodeError):
            # Unreadable or undecodable file: treat as "not a plugin".
            return None
        return conf

    def put(self, label, value):
        """Send ``value`` under ``<prefix>.coven.<label>``."""
        self.cauldron.put('%s.coven.%s' % (self.prefix, label), value)

    def log(self, message, component=None, priority=syslog.LOG_DEBUG):
        """Write ``message`` to syslog, prefixed with ``[component]`` if given."""
        if component is not None:
            message = '[%s] %s' % (component, message)
        syslog.syslog(priority, message)

    def practice(self):
        """Run the reporting loop until interrupted, then shut everything down.

        Every second the number of running plugins and each plugin's counters
        are published, per plugin and summed over all plugins, and the
        plugin counters are reset. On ``KeyboardInterrupt`` the directory
        watcher is stopped and all plugins are stopped and joined.
        """
        try:
            self.dirManager.start()
            while True:
                totals = {}
                with self.plugins_lock:
                    self.put('plugins.running', len(self.plugins))
                    for plugin in self.plugins.values():
                        counters = plugin.get_counters()
                        for (values_type, values) in counters.items():
                            total_values = totals.setdefault(values_type, {})
                            for (label, value) in values.items():
                                if label in total_values:
                                    total_values[label] += value
                                else:
                                    total_values[label] = value
                                self.put(
                                    'plugin.%s.%s.%s' %
                                    (plugin.name, values_type, label), value)
                        plugin.reset_counters()
                for (values_type, values) in totals.items():
                    for (label, value) in values.items():
                        self.put('plugins.total.%s.%s' % (values_type, label),
                                 value)
                time.sleep(1.0)
        except KeyboardInterrupt:
            self.log("Stopping..")
            self.dirManager.stop()
            with self.plugins_lock:
                parting = list(self.plugins.values())
                for partingPlugin in parting:
                    partingPlugin.stop()
            # Ask every plugin to stop first, then wait for each of them.
            for partingPlugin in parting:
                partingPlugin.join()
            self.log("Stopped..", priority=syslog.LOG_INFO)

    # ----------------------------------------------------------------------

    def startPlugin(self, name):
        """Create, register and start the plugin stored in file ``name``.

        Files without any ``x3`` configuration line are logged as
        "Not an X3 Plugin" and skipped; start-up errors are logged, not raised.
        """
        self.log("Starting %s" % (name))
        config = self.getConfig(name)
        if not config:
            self.log("Not an X3 Plugin: %s" % (name),
                     priority=syslog.LOG_WARNING)
            return
        try:
            with self.plugins_lock:
                self.plugins[name] = Plugin(name, self.getOSPath(name),
                                            self.prefix, config,
                                            self.cauldronAddr, self.log)
                self.plugins[name].start()
            self.log("Started %s" % (name), priority=syslog.LOG_INFO)
        except Exception:
            # Best effort, but no longer swallows KeyboardInterrupt/SystemExit.
            self.log("Failed To Start %s" % (name), priority=syslog.LOG_ERR)

    def stopPlugin(self, name):
        """Stop and unregister plugin ``name``; failures are logged, not raised."""
        self.log("Stopping %s" % (name))
        try:
            with self.plugins_lock:
                self.plugins[name].stop()
                del self.plugins[name]
            self.log("Stopped %s" % (name), priority=syslog.LOG_INFO)
        except Exception:
            # Includes KeyError for a name that was never started.
            self.log("Failed To Stop %s" % (name), priority=syslog.LOG_ERR)

    # ----------------------------------------------------------------------

    def process_FileCreated(self, name):
        """Handle a new file: try to start it as a plugin."""
        self.startPlugin(name)

    def process_FileDeleted(self, name):
        """Handle a removed file: stop its plugin."""
        self.stopPlugin(name)

    def process_FileChanged(self, name):
        """Handle a modified file: restart its plugin."""
        self.log("Restarting %s" % (name), priority=syslog.LOG_INFO)
        self.stopPlugin(name)
        self.startPlugin(name)
def setUpClass(cls):
    """Point ``config`` at a fresh temporary base directory and build the manager.

    The resulting ``DirManager`` is published through the module-level
    ``dirmanager`` global.
    """
    global dirmanager
    scratch_base = tempfile.mkdtemp()
    config.base_dir = scratch_base
    # Reload so the new base_dir takes effect before the manager is built
    # (presumably what __reload__ is for -- confirm against config).
    config.__reload__()
    dirmanager = DirManager()
def setUp(self):
    """Give each test a new ``DirManager`` (``self.dm``) with ``test_dir`` added."""
    self.dm = manager = DirManager()
    manager.add_directory(test_dir)
class PreProcessing():
    """Post-process downloaded workbooks and aggregate them into one CSV."""

    def __init__(self, scraper_download_dir):
        """Remember the download directory and the ``.xls`` files in it now.

        Args:
            scraper_download_dir: Directory the scraper downloads into.
        """
        download_file_dir_wildcard = '{}/*.xls'.format(scraper_download_dir)
        self.filenames = glob.glob(download_file_dir_wildcard)
        self.download_dir = scraper_download_dir
        # Assigned by insertColumns(); aggregateData() creates it on demand.
        self.insertCandidateFolder = None

    def aggregateData(self):
        """Merge every workbook of the inserted-data folder into one CSV file.

        The first sheet of each workbook is appended, oldest file first, to
        ``data.csv`` inside the ``aggregated_data`` folder. Row 0 (the header
        row) is written only for the first workbook so it is not repeated.
        """
        # Create new directory for storing aggregated data
        aggregateFolder = DirManager(['aggregated_data'])
        aggregateFolder.createFolder()
        new_csv_file = path.join(aggregateFolder.getDirectory(), 'data.csv')

        # insertColumns() may never have created the folder (no downloads);
        # create it here instead of failing on a missing attribute.
        if self.insertCandidateFolder is None:
            self.insertCandidateFolder = DirManager(['insertedData'])
            self.insertCandidateFolder.createFolder()
        insertColumsFolder = self.insertCandidateFolder.getDirectory()

        filenames = self._filesByMtime(insertColumsFolder)

        # newline='' lets the csv module control line endings itself.
        with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
            new_worksheet = csv.writer(new_aggregate_csv,
                                       quoting=csv.QUOTE_ALL)

            # Loop through all workbooks (EXCEL)
            header = False
            for filename in filenames:
                sheet = xlrd.open_workbook(filename).sheet_by_index(0)

                # Only pull the header row from the first file
                first_row = 1 if header else 0
                for rownum in range(first_row, sheet.nrows):
                    new_worksheet.writerow(sheet.row_values(rownum))
                header = True

    def insertColumns(self, numDownloads, CandidateName, ElectionDate,
                      BallotItem):
        """Copy the newest downloads with three extra leading columns.

        Each of the last ``numDownloads`` files returned by
        ``insertColumnsHelper()`` is written, under the same file name, into
        the ``insertedData`` folder. Because every column is inserted at
        position 0, the leading columns end up as ``Ballot Item``,
        ``Election Date``, ``CandidateControlledName``.

        Args:
            numDownloads: Number of most recent files to process; ``0``
                returns immediately.
            CandidateName: Candidate column value; a single space (``" "``)
                is recorded as ``"Independent"``.
            ElectionDate: Value for the ``Election Date`` column.
            BallotItem: Value for the ``Ballot Item`` column.
        """
        print('Processing {} for {}'.format(numDownloads, CandidateName))
        if numDownloads == 0:
            return

        self.insertCandidateFolder = DirManager(['insertedData'])
        self.insertCandidateFolder.createFolder()
        new_folder = self.insertCandidateFolder.getDirectory()

        filenames = self.insertColumnsHelper()

        candidateHeader = "CandidateControlledName"
        electionDateHeader = "Election Date"
        ballotItemHeader = "Ballot Item"
        candidateValue = ("Independent"
                          if CandidateName == " " else CandidateName)

        # Columns read as text instead of letting pandas infer a dtype.
        errordTypes = [
            'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
            'XRef_Match'
        ]
        dtypes = {datatype: str for datatype in errordTypes}

        print(filenames)
        # One shared sink for xlrd's log output, closed after the loop
        # (previously one devnull handle leaked per workbook).
        with open(devnull, 'w') as xlrd_log:
            for fullfilepathname in filenames[-numDownloads:]:
                filename = path.basename(fullfilepathname)
                print(filename)
                wb = xlrd.open_workbook(fullfilepathname, logfile=xlrd_log)
                data = pd.read_excel(wb, dtype=dtypes)
                data.insert(0, candidateHeader, candidateValue)
                data.insert(0, electionDateHeader, ElectionDate)
                data.insert(0, ballotItemHeader, BallotItem)
                data.to_excel('{}/{}'.format(new_folder, filename),
                              index=False)

    def insertColumnsHelper(self):
        """Wait for the newest download to finish and list the download dir.

        Polls every 3 seconds until the most recently modified file has
        ``transactionExportGrid`` in its name and no ``crdownload`` marker.
        An empty directory is treated as "still downloading" rather than
        raising ``IndexError``.

        Returns:
            Full paths of all files in ``self.download_dir``, oldest first.
        """
        while True:
            filenames = self._filesByMtime(self.download_dir)
            if filenames:
                filename = path.basename(filenames[-1])
                print(filename)
                if ("transactionExportGrid" in filename
                        and "crdownload" not in filename):
                    return filenames
            sleep(3)

    @staticmethod
    def _filesByMtime(directory):
        """Return the full paths of ``directory``'s entries, oldest first."""
        return sorted((path.join(directory, f) for f in listdir(directory)),
                      key=path.getmtime)
class Scraper:
    """Drive Chrome through the campaign-document search and download forms."""

    # HEADLESS values that switch headless mode off (compared lower-cased).
    _FALSE_FLAGS = ('', '0', 'false', 'no', 'off')

    def __init__(self):
        """Prepare the download folder, the website helper and a Chrome driver.

        Headless mode is on by default. Setting the ``HEADLESS`` environment
        variable to an empty string or one of ``0``/``false``/``no``/``off``
        (case-insensitive) shows the browser window instead.
        """
        self.DEFAULT_SLEEP_TIME = 5
        self.SEARCH_FORM_ADDRESS = "https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx"

        # create data folder to store downloaded files
        self.website = SjcWebsite()
        self.new_dir = DirManager(["data"])
        self.new_dir.createFolder()
        self.download_dir = self.new_dir.getDirectory()
        self.website.preprocessing = PreProcessing(self.download_dir)

        options = webdriver.ChromeOptions()

        # enable headless data retrieval
        isHeadless = self._flagEnabled(os.environ.get('HEADLESS'), True)
        if isHeadless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1280,800")

        for argument in (
                "--ignore-certificate-errors",
                "--test_type",
                "--no-sandbox",
                "start-maximized",
                "disable-infobars",
                "--disable-extensions",
        ):
            options.add_argument(argument)

        plugs = {"enabled": False, "name": "Chrome PDF Viewer"}
        prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": False,
            "safebrowsing.disable_download_protection": True,
            "plugins.plugins_list": [plugs],
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(ChromeDriverManager().install(),
                                       options=options)

    @staticmethod
    def _flagEnabled(raw, default):
        """Interpret an environment-variable string as a boolean.

        Args:
            raw: The variable's value, or ``None`` when it is unset.
            default: Result for an unset variable.

        Returns:
            ``bool(default)`` when ``raw`` is ``None``; otherwise False for
            an empty string, ``0``, ``false``, ``no`` or ``off`` (ignoring
            case and surrounding whitespace) and True for anything else.
        """
        if raw is None:
            return bool(default)
        return raw.strip().lower() not in Scraper._FALSE_FLAGS

    def scrape(self, election_cycle=None):
        """Download every form of every search result, then aggregate them.

        Args:
            election_cycle: Forwarded to ``navigateToSearchPage``.

        The browser is closed even when scraping fails; aggregation only
        runs after a successful scrape.
        """
        try:
            # Navigate to the search form
            self.website.navigateToSearchPage(self.driver,
                                              self.SEARCH_FORM_ADDRESS,
                                              election_cycle=election_cycle)
            self.website.verifySearchTableLoadComplete(self.driver)

            countFile = 0
            last_page = self.website.numPages(self.driver)
            for search_page_num in range(1, last_page + 1):
                print('PAGE {}'.format(search_page_num))

                # Need to navigate to the page upfront so that when we get
                # the number of entries on the page it is accurate.
                self.website.navigateToPage(self.driver, search_page_num)

                for entry_index in self.website.numTableEntries(
                        self.driver, search_page_num):
                    print('INDEX {}'.format(entry_index))

                    # Re-navigate for every entry: per the original note, the
                    # previous round trip brings the website back to page 1.
                    self.website.navigateToPage(self.driver, search_page_num)
                    self.website.extractTableData(self.driver, entry_index)
                    # NOTE(review): "% 10" presumably maps the index to its
                    # position on a 10-row page -- confirm against SjcWebsite.
                    self.website.clickEntryIndex(self.driver,
                                                 entry_index % 10)

                    sleep(self.DEFAULT_SLEEP_TIME)

                    if self.website.errorDialogExists(self.driver):
                        # If there are no forms for a specific entry, we get
                        # an error message.
                        self.website.closeErrorDialog(self.driver)
                    else:
                        # If there are forms, then we will be brought to the
                        # "forms" page.
                        self.website.verifyDownloadFormTableLoadComplete(
                            self.driver)
                        countFile = self.website.downloadExcel(
                            self.driver, countFile)
                        self.website.clickBackButton(self.driver)
                        self.website.verifySearchTableLoadComplete(
                            self.driver)
        finally:
            # Close browser once scrape is complete (or has failed)
            self.driver.quit()

        # Custom module to aggregate data into single CSV
        self.website.preprocessing.aggregateData()
class Coven(object):
    """Supervise plugin scripts that live in a watched directory.

    A ``DirManager`` is created with this object as its second argument; the
    ``process_File*`` methods below start, stop and restart ``Plugin``
    objects by file name. ``practice`` is the main loop that publishes plugin
    counters through a ``CauldronSender`` once per second.
    """

    def __init__(self, path, prefix, cauldronAddr, ignore=None):
        """Set up logging, the plugin registry, the watcher and the sender.

        Args:
            path: Directory containing the plugin files.
            prefix: Label prefix used by ``put`` and passed to each ``Plugin``.
            cauldronAddr: Address handed to ``CauldronSender`` and ``Plugin``.
            ignore: List of regular-expression strings passed to
                ``DirManager`` as ``ignore``. ``None`` selects the built-in
                list; it is built per instance so no mutable default is
                shared.
        """
        if ignore is None:
            ignore = [
                r'^\.', r'\.x?swp$', r'~', r'^__', r'__$', r'\.jar$', r'\.db$'
            ]
        syslog.openlog(ident="X3Coven", facility=syslog.LOG_DAEMON)
        self.path = path
        self.prefix = prefix
        self.cauldronAddr = cauldronAddr
        self.plugins = {}
        # Matches "# x3.<key> = <value>" lines. Raw string: same pattern as
        # before, without the invalid "\." escape of a non-raw literal.
        self.confRE = re.compile(r'^#\s*x3\.([a-z0-9.]+)\s*=\s*(.*)\s*$')
        self.dirManager = DirManager(self.path, self, ignore=ignore)
        self.cauldron = CauldronSender(self.cauldronAddr)

    def getOSPath(self, name):
        """Return ``name`` joined onto the plugin directory."""
        return path.join(self.path, name)

    def isShebangExecutable(self, path):
        """Return True when the file at ``path`` starts with ``#!``.

        Unreadable or missing files yield False.
        """
        try:
            with open(path, 'rb') as plugin_file:
                return plugin_file.read(2) == b'#!'
        except (IOError, OSError):
            return False

    def getConfig(self, name):
        """Collect the ``# x3.<key> = <value>`` settings of plugin ``name``.

        Returns:
            A dict mapping each key to its value string, or ``None`` when the
            file has no shebang or cannot be read.
        """
        pluginPath = self.getOSPath(name)
        if not self.isShebangExecutable(pluginPath):
            return None
        conf = {}
        try:
            with open(pluginPath, 'r') as plugin_file:
                for line in plugin_file:
                    config_match = self.confRE.match(line)
                    if config_match:
                        conf[config_match.group(1)] = config_match.group(2)
        except (IOError, OSError, UnicodeError):
            # Unreadable or undecodable file: treat as "not a plugin".
            return None
        return conf

    def put(self, label, value):
        """Send ``value`` under ``<prefix>.coven.<label>``."""
        self.cauldron.put('%s.coven.%s' % (self.prefix, label), value)

    def log(self, message, component=None, priority=syslog.LOG_DEBUG):
        """Write ``message`` to syslog, prefixed with ``[component]`` if given."""
        if component is not None:
            message = '[%s] %s' % (component, message)
        syslog.syslog(priority, message)

    def practice(self):
        """Run the reporting loop until interrupted, then shut everything down.

        Every second the number of running plugins and each plugin's counters
        are published, per plugin and summed over all plugins, and the
        plugin counters are reset. On ``KeyboardInterrupt`` the directory
        watcher is stopped and all plugins are stopped and joined.
        """
        try:
            self.dirManager.start()
            while True:
                totals = {}
                self.put('plugins.running', len(self.plugins))
                # Iterate over a snapshot: startPlugin/stopPlugin mutate
                # self.plugins (presumably from the watcher's thread), which
                # would break iteration over the live dict.
                for plugin in list(self.plugins.values()):
                    counters = plugin.get_counters()
                    for (values_type, values) in counters.items():
                        total_values = totals.setdefault(values_type, {})
                        for (label, value) in values.items():
                            if label in total_values:
                                total_values[label] += value
                            else:
                                total_values[label] = value
                            self.put(
                                'plugin.%s.%s.%s' %
                                (plugin.name, values_type, label), value)
                    plugin.reset_counters()
                for (values_type, values) in totals.items():
                    for (label, value) in values.items():
                        self.put('plugins.total.%s.%s' % (values_type, label),
                                 value)
                time.sleep(1.0)
        except KeyboardInterrupt:
            self.log("Stopping..")
            self.dirManager.stop()
            parting = list(self.plugins.values())
            # Ask every plugin to stop first, then wait for each of them.
            for partingPlugin in parting:
                partingPlugin.stop()
            for partingPlugin in parting:
                partingPlugin.join()
            self.log("Stopped..", priority=syslog.LOG_INFO)

    # ------------------------------------------------------------------------

    def startPlugin(self, name):
        """Create, register and start the plugin stored in file ``name``.

        Files without any ``x3`` configuration line are logged as
        "Not an X3 Plugin" and skipped; start-up errors are logged, not raised.
        """
        self.log("Starting %s" % (name))
        config = self.getConfig(name)
        if not config:
            self.log("Not an X3 Plugin: %s" % (name),
                     priority=syslog.LOG_WARNING)
            return
        try:
            self.plugins[name] = Plugin(name, self.getOSPath(name),
                                        self.prefix, config,
                                        self.cauldronAddr, self.log)
            self.plugins[name].start()
            self.log("Started %s" % (name), priority=syslog.LOG_INFO)
        except Exception:
            # Best effort, but no longer swallows KeyboardInterrupt/SystemExit.
            self.log("Failed To Start %s" % (name), priority=syslog.LOG_ERR)

    def stopPlugin(self, name):
        """Stop and unregister plugin ``name``; failures are logged, not raised."""
        self.log("Stopping %s" % (name))
        try:
            self.plugins[name].stop()
            del self.plugins[name]
            self.log("Stopped %s" % (name), priority=syslog.LOG_INFO)
        except Exception:
            # Includes KeyError for a name that was never started.
            self.log("Failed To Stop %s" % (name), priority=syslog.LOG_ERR)

    # ------------------------------------------------------------------------

    def process_FileCreated(self, name):
        """Handle a new file: try to start it as a plugin."""
        self.startPlugin(name)

    def process_FileDeleted(self, name):
        """Handle a removed file: stop its plugin."""
        self.stopPlugin(name)

    def process_FileChanged(self, name):
        """Handle a modified file: restart its plugin."""
        self.log("Restarting %s" % (name), priority=syslog.LOG_INFO)
        self.stopPlugin(name)
        self.startPlugin(name)