def aggregateData(self):
    """Merge every workbook in the inserted-columns folder into one CSV.

    Creates the ``aggregated_data`` folder through ``DirManager`` and writes
    ``data.csv`` inside it. The input workbooks are the entries of the
    directory reported by ``self.insertCandidateFolder``, processed in order
    of modification time (oldest first). Only the first sheet of each
    workbook is read. Every row of the first workbook is copied; row 0 of
    each later workbook is skipped so the header appears only once.
    """
    aggregateFolder = DirManager(['aggregated_data'])
    aggregateFolder.createFolder()
    new_csv_file = '{}/data.csv'.format(aggregateFolder.getDirectory())

    insertColumsFolder = self.insertCandidateFolder.getDirectory()
    filenames = sorted(
        [insertColumsFolder + "/" + f for f in listdir(insertColumsFolder)],
        key=path.getmtime)

    # newline='' lets the csv module control line endings itself; without it
    # the writer emits an extra blank line after every row on Windows.
    with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
        new_worksheet = csv.writer(new_aggregate_csv, quoting=csv.QUOTE_ALL)
        for file_index, filename in enumerate(filenames):
            sheet = xlrd.open_workbook(filename).sheet_by_index(0)
            # Keep the header row (row 0) from the first workbook only.
            first_row = 0 if file_index == 0 else 1
            new_worksheet.writerows(
                sheet.row_values(rownum)
                for rownum in range(first_row, sheet.nrows))
def test_empty_dir(self):
    """list_available() returns a dict after a temporary directory is added.

    The temporary directory holds two empty files ("test" and "other") and
    is removed again when the test finishes.
    """
    import shutil

    dirManager = DirManager()
    tmpdir = tempfile.mkdtemp()
    # mkdtemp() never cleans up after itself; register removal so repeated
    # test runs do not accumulate scratch directories.
    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
    for name in ("test", "other"):
        open(os.path.join(tmpdir, name), "w").close()
    dirManager.add_directory(tmpdir)
    # assertIsInstance reports the actual type when the check fails.
    self.assertIsInstance(dirManager.list_available(), dict)
def insertCandidates(self, numDownloads, CandidateName):
    """Copy the newest downloads with a candidate-name column prepended.

    The ``numDownloads`` most recently modified entries of
    ``self.download_dir`` are read as Excel workbooks, given a leading
    ``CandidateControlledName`` column, and written under the same file name
    into the ``insertCandidateControlled`` folder created via ``DirManager``.

    Args:
        numDownloads: Number of most recently modified files to process.
            Zero means there is nothing to do.
        CandidateName: Value for the new column. A single space (``" "``)
            is recorded as ``"Independent"``.
    """
    print('Processing {} for {}'.format(numDownloads, CandidateName))
    # filenames[-0:] is the *whole* list, so zero has to be handled
    # explicitly (same guard as insertColumns).
    if numDownloads == 0:
        return

    insertCandidateFolder = DirManager(['insertCandidateControlled'])
    insertCandidateFolder.createFolder()
    new_folder = insertCandidateFolder.getDirectory()
    filenames = sorted(
        [self.download_dir + "/" + f for f in listdir(self.download_dir)],
        key=path.getmtime)

    candidateHeader = "CandidateControlledName"
    candidate_value = "Independent" if CandidateName == " " else CandidateName
    # Columns that are forced to str when the workbook is loaded.
    errordTypes = [
        'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
        'XRef_Match'
    ]
    string_columns = {datatype: str for datatype in errordTypes}

    # One devnull handle serves as xlrd's logfile for every workbook and is
    # closed when the loop ends, instead of leaking a handle per file.
    with open(devnull, 'w') as null_log:
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            wb = xlrd.open_workbook(fullfilepathname, logfile=null_log)
            data = pd.read_excel(wb, dtype=string_columns)
            data.insert(0, candidateHeader, candidate_value)
            data.to_excel('{}/{}'.format(new_folder, filename), index=False)
def __init__(self, path, prefix, cauldronAddr, ignore=None):
    """Open syslog and set up plugin state, DirManager and CauldronSender.

    Args:
        path: Stored as ``self.path`` and passed to ``DirManager``.
        prefix: Stored as ``self.prefix``.
        cauldronAddr: Stored as ``self.cauldronAddr`` and passed to
            ``CauldronSender``.
        ignore: List of regex pattern strings forwarded to ``DirManager``
            as its ``ignore`` argument. When omitted (``None``), a fresh
            copy of the default pattern list is used.
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object across every instance.
    if ignore is None:
        ignore = [
            r'^\.', r'\.x?swp$', r'~', r'^__', r'__$', r'\.jar$', r'\.db$'
        ]
    syslog.openlog(ident="X3Coven", facility=syslog.LOG_DAEMON)
    self.path = path
    self.prefix = prefix
    self.cauldronAddr = cauldronAddr
    self.plugins = {}
    self.plugins_lock = Lock()
    # Matches lines of the form "# x3.<key> = <value>". Raw string: the
    # previous literal relied on the invalid escape "\." in a normal string.
    self.confRE = re.compile(r'^#\s*x3\.([a-z0-9.]+)\s*=\s*(.*)\s*$')
    self.dirManager = DirManager(self.path, self, ignore=ignore)
    self.cauldron = CauldronSender(self.cauldronAddr)
def __init__(self):
    """Create the data folder, the website helper and a Chrome driver.

    Downloads are directed to the ``data`` folder created via
    ``DirManager``. Headless mode is switched on when the ``HEADLESS``
    environment variable is set to any non-empty value.
    """
    self.DEFAULT_SLEEP_TIME = 5
    self.SEARCH_FORM_ADDRESS = "https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx"

    # Data folder in the current directory that receives the downloads.
    self.website = SjcWebsite()
    self.new_dir = DirManager(["data"])
    self.new_dir.createFolder()
    self.download_dir = self.new_dir.getDirectory()
    self.website.preprocessing = PreProcessing(self.download_dir)

    options = webdriver.ChromeOptions()

    # NOTE (from the original author): headless retrieval is not working
    # 100% -- it only downloads the first link on the form table.
    # "--disable-gpu" and "--window-size=1280,800" were left disabled.
    chrome_flags = []
    if os.environ.get('HEADLESS', False):
        chrome_flags.append("--headless")
    chrome_flags.extend([
        "--ignore-certificate-errors",
        "--test_type",
        "--no-sandbox",
        "start-maximized",
        "disable-infobars",
        "--disable-extensions",
    ])
    for flag in chrome_flags:
        options.add_argument(flag)

    pdf_viewer_plugin = {"enabled": False, "name": "Chrome PDF Viewer"}
    options.add_experimental_option("prefs", {
        "download.default_directory": self.download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": False,
        "safebrowsing.disable_download_protection": True,
        "plugins.plugins_list": [pdf_viewer_plugin],
    })

    driver_path = ChromeDriverManager().install()
    self.driver = webdriver.Chrome(driver_path, options=options)
def insertColumns(self, numDownloads, CandidateName, ElectionDate, BallotItem):
    """Write copies of the latest downloads with three leading columns added.

    The last ``numDownloads`` paths returned by
    ``self.insertColumnsHelper()`` are read as Excel workbooks and saved
    under the same file name in the ``insertedData`` folder. The resulting
    leading columns are, in order: ``Ballot Item``, ``Election Date``,
    ``CandidateControlledName``.

    Args:
        numDownloads: Number of trailing entries to process; zero returns
            immediately without creating the output folder.
        CandidateName: Candidate column value. A single space (``" "``) is
            recorded as ``"Independent"``.
        ElectionDate: Value for the ``Election Date`` column.
        BallotItem: Value for the ``Ballot Item`` column.
    """
    print('Processing {} for {}'.format(numDownloads, CandidateName))
    if numDownloads == 0:
        return

    self.insertCandidateFolder = DirManager(['insertedData'])
    self.insertCandidateFolder.createFolder()
    new_folder = self.insertCandidateFolder.getDirectory()
    filenames = self.insertColumnsHelper()

    candidateHeader = "CandidateControlledName"
    electionDateHeader = "Election Date"
    ballotItemHeader = "Ballot Item"
    candidate_value = "Independent" if CandidateName == " " else CandidateName
    # Columns that are forced to str when the workbook is loaded.
    errordTypes = [
        'Cmte_ID', 'Intr_Nam L', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
        'XRef_Match'
    ]
    string_columns = {datatype: str for datatype in errordTypes}

    print(filenames)
    # One devnull handle serves as xlrd's logfile for every workbook and is
    # closed when the loop ends, instead of leaking a handle per file.
    with open(devnull, 'w') as null_log:
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            print(filename)
            wb = xlrd.open_workbook(fullfilepathname, logfile=null_log)
            data = pd.read_excel(wb, dtype=string_columns)
            # Each insert goes to position 0, so the last one inserted
            # ends up as the first column.
            data.insert(0, candidateHeader, candidate_value)
            data.insert(0, electionDateHeader, ElectionDate)
            data.insert(0, ballotItemHeader, BallotItem)
            data.to_excel('{}/{}'.format(new_folder, filename), index=False)
def test_found_entry(self):
    """found_entry(dot) yields the expected mp3 path once test_dir is added."""
    dirManager = DirManager()
    dirManager.add_directory(test_dir)
    ans = dirManager.found_entry(dot)
    # assertEqual shows both values on failure; assertTrue(a == b) only
    # reports "False is not true".
    self.assertEqual(ans, "../test_files/DOt_-_05_-_IMF.mp3")
def test_random_list(self):
    """Smoke test: random_list() runs without raising for ../test_files/."""
    manager = DirManager()
    manager.add_directory("../test_files/")
    # The result is not inspected; the test passes as long as the call
    # completes without an exception.
    manager.random_list()
    self.assertTrue(True)
def test_available_dict(self):
    """list_available_nested_dict() returns something dict() can consume."""
    dirManager = DirManager()
    # Named `nested` rather than `json` so the stdlib module name is not
    # shadowed inside the test.
    nested = dict(dirManager.list_available_nested_dict())
    # NOTE(review): dict(...) always produces a dict, so the real check here
    # is that the conversion itself does not raise.
    self.assertIsInstance(nested, dict)
def test_available(self):
    """list_available() on a fresh DirManager returns a dict."""
    dirManager = DirManager()
    # assertIsInstance reports the actual type when the check fails.
    self.assertIsInstance(dirManager.list_available(), dict)
def test_get_directory(self):
    """get_directory() contains an added path and returns it when asked by name."""
    dirManager = DirManager()
    dirManager.add_directory("./teste/")
    # assertIn / assertEqual print the offending values on failure, unlike
    # assertTrue(<expression>).
    self.assertIn("./teste/", dirManager.get_directory())
    self.assertEqual("./teste/", dirManager.get_directory("./teste/"))
def setUp(self):
    """Give each test a new DirManager (``self.dm``) with test_dir added."""
    manager = DirManager()
    self.dm = manager
    manager.add_directory(test_dir)
def setUpClass(cls):
    """Point config at a new temp directory and build the shared DirManager.

    The manager is stored in the module-level global ``dirmanager``.
    """
    global dirmanager
    scratch_base = tempfile.mkdtemp()
    config.base_dir = scratch_base
    # NOTE(review): presumably makes config pick up the new base_dir --
    # confirm against the config module.
    config.__reload__()
    dirmanager = DirManager()