def scraper():
    # make an HTTP web request to get the source information
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        table = bs4.BeautifulSoup(response.text, features="html.parser").select(
            '#msdhTotalCovid-19Cases tbody tr')

        counties = []
        for item in table:
            row = item.find_all('td')
            county_name = row[0].text
            # strip thousands separators and footnote asterisks before parsing
            confirmed = int(row[1].text.replace(',', '').replace('*', ''))
            deaths = int(row[2].text.replace(',', '').replace('*', ''))
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
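# NOTE: the scrapers in this listing are per-state modules that share the same
# module-level preamble. A minimal sketch of that preamble follows; the import
# set and the constant values are assumptions inferred from the call sites
# (county_report and state_report are project-local modules, not PyPI packages).
# Additional stdlib imports (pathlib, json, csv, io, os, zipfile, time, sys) and
# third-party ones (openpyxl, selenium) appear as individual scrapers need them.
import datetime

import bs4
import requests

import county_report
import state_report

URL = 'https://example.gov/covid-data'  # hypothetical; each state module sets its own
STATE = 'Mississippi'                   # hypothetical full state name
STATE_ABBR = 'MS'                       # hypothetical two-letter abbreviation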
def scraper():
    # load the POST body for the Power BI-style endpoint from the config folder
    payload = ''
    filepath = pathlib.Path.cwd().joinpath('config', 'nv_post_body.json')
    with open(filepath, 'r') as file:
        payload = file.read().replace('\n', '')

    # make an HTTP web request to get the data
    response = requests.post(URL, data=payload)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        features = jsonPayload['results'][0]['result']['data']['dsr']['DS'][0][
            'PH'][0]['DM0']

        counties = []
        for feature in features:
            # entries carrying an 'S' key hold schema metadata, not county rows
            if 'S' in feature:
                continue

            county_object = feature['C']
            # an 'R' key appears to mean the deaths column was compressed out of
            # this row, shifting the cases value from index 3 to index 2
            has_R = 'R' in feature
            deaths = 0
            cases_index = 3
            if has_R:
                cases_index = 2
            else:
                deaths = int(county_object[1])

            county_name = county_object[0]
            confirmed = int(county_object[cases_index])
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the state's XLSX file
    response = requests.get(URL)
    counties = []

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        # write the workbook to a temp folder before opening it
        temppath = 'temp'
        if not os.path.exists(temppath):
            os.makedirs(temppath)
        tempfilename = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx'
        tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename)
        with open(tempfilepath, "wb") as file:
            file.write(response.content)

        wb = openpyxl.load_workbook(filename=tempfilepath)
        sheet = wb.worksheets[0]
        max_rows = sheet.max_row
        max_cols = sheet.max_column

        # county rows start at row 4; stop when we hit the footer rows
        for i in range(4, max_rows):
            rowCount = str(i)
            county_name = sheet['A' + rowCount].value
            if county_name is None or county_name == 'Unknown' or county_name == 'Total' or len(
                    county_name) == 0:
                break
            # the confirmed-case count lives in the last column of the sheet
            confirmed = sheet.cell(row=i, column=max_cols).value
            county = county_report.CountyReport(STATE, county_name,
                                                int(confirmed), -1, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the CA CSV file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        csvData = response.text

        # read the in-memory string using the 'csv' module so we can iterate over each row
        csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"')

        # create a list that will contain our county data
        counties = []

        # iterate over every row in the CSV
        for row in csvReader:
            # skip the header row
            if row[0] == 'county':
                continue

            county_name = row[0]

            # counts may arrive blank or as floats; normalize both to int
            confirmedStr = row[1]
            confirmed = 0
            if '.' in confirmedStr:
                confirmed = int(float(confirmedStr))
            elif len(confirmedStr) > 0:
                confirmed = int(confirmedStr)

            deathsStr = row[2]
            deaths = 0
            if '.' in deathsStr:
                deaths = int(float(deathsStr))
            elif len(deathsStr) > 0:
                deaths = int(deathsStr)

            county = findCounty(county_name, counties)
            if county is None:
                county = county_report.CountyReport(STATE, county_name,
                                                    confirmed, deaths, -1, -1,
                                                    datetime.datetime.now())
                counties.append(county)  # append the countyReport to our list of counties
            else:
                # duplicate county rows overwrite earlier values, so the last row wins
                county.confirmed = confirmed
                county.deaths = deaths

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP status code ',
              response.status_code)
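# findCounty is used by several scrapers above and below but never defined in
# this listing. A minimal sketch, assuming a linear scan over the collected
# CountyReport objects (the attribute name 'county' is an assumption):
def findCounty(county_name, counties):
    # return the existing report for this county name, or None on a miss
    for existing in counties:
        if existing.county == county_name:
            return existing
    return None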
def scraper():
    # make an HTTP web request to get the data
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        features = jsonPayload['features']

        counties = []
        for feature in features:
            attribute = feature['attributes']
            county_name = attribute['NAME']
            # skip blank names and the statewide 'WI' roll-up row
            if not county_name or county_name == 'WI':
                continue

            confirmed = int(attribute['POSITIVE'])
            hospitalizations = 0
            if attribute['HOSP_YES'] is not None:
                hospitalizations = int(attribute['HOSP_YES'])
            deaths = 0
            if attribute['DEATHS'] is not None:
                deaths = int(attribute['DEATHS'])

            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, hospitalizations, -1,
                                                datetime.datetime.now())

            # the feed can repeat a county; keep whichever record reports more
            existing_county = findCounty(county_name, counties)
            if existing_county is None:
                counties.append(county)
            elif (existing_county.confirmed < county.confirmed
                  or existing_county.deaths < county.deaths
                  or existing_county.hospitalizations < county.hospitalizations):
                existing_county.confirmed = county.confirmed
                existing_county.deaths = county.deaths
                existing_county.hospitalizations = county.hospitalizations

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the FL Json file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        items = jsonPayload['features']

        counties = []
        for item in items:
            attributes = item['properties']
            county_name = attributes['County_1']
            if county_name == 'State':  # this is FL's total, so skip
                continue

            confirmedStr = attributes['CasesAll']
            confirmed = int(confirmedStr)
            deathsStr = attributes['Deaths']
            deaths = int(deathsStr)
            hospitalizationsResStr = attributes[
                'C_HospYes_Res']  # hospitalizations - Florida residents
            hospitalizationsRes = int(hospitalizationsResStr)
            hospitalizationsNonResStr = attributes[
                'C_HospYes_NonRes']  # hospitalizations - non-residents
            hospitalizationsNonRes = int(hospitalizationsNonResStr)

            county = county_report.CountyReport(
                STATE, county_name, confirmed, deaths,
                hospitalizationsRes + hospitalizationsNonRes, -1,
                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the GA ZIP file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': ZIP download succeeded')

        # read ZIP into memory
        z = zipfile.ZipFile(io.BytesIO(response.content))

        # extract the CSV file from the ZIP file into an in-memory byte array
        csvDataBytes = z.read('countycases.csv')

        # convert the byte array into a string so we can read it as a CSV file
        csvData = csvDataBytes.decode(encoding='UTF-8')

        # read the in-memory string using the 'csv' module so we can iterate over each row
        csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"')

        # create a list that will contain our county data
        counties = []

        # iterate over every row in the CSV
        for row in csvReader:
            # skip the header row
            if row[0] == 'county_resident':
                continue

            # take the row we're iterating over and build a countyReport object out of it -
            # this has the confirmed cases, deaths, etc. that we're interested in
            county = county_report.CountyReport(STATE, row[0], int(row[1]),
                                                int(row[2]), int(row[3]),
                                                float(row[4]),
                                                datetime.datetime.now())
            counties.append(county)  # append the countyReport to our list of counties

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : ZIP download failed - HTTP status code ',
              response.status_code)
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    browser.get(URL)

    # expand the confirmed cases/deaths table, then wait for it to render
    counties_link = WebDriverWait(browser, 20).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="togConfirmedCasesDeathsTbl"]')))
    counties_link.click()

    counties_table = WebDriverWait(browser, 20).until(
        EC.presence_of_element_located((
            By.XPATH,
            '/html/body/form/div[3]/div/div/div[4]/div/div/div/div[2]/main/div/div[6]/div/div/div[3]/div/div/div/div/div/div/div/div[2]/div/div/table'
        )))
    time.sleep(2)

    htmlRows = counties_table.find_elements_by_xpath(".//tbody/tr")
    rows = get_row_data(htmlRows)
    for row in rows:
        # skip rows too short to hold all four columns, plus the summary rows
        if len(row) < 4:
            continue
        county_name = row[0]
        if county_name == 'Total' or county_name == 'Unassigned':
            continue
        confirmed = int(row[1].replace(',', ''))
        hospitalizations = int(row[2].replace(',', ''))
        deaths = int(row[3].replace(',', ''))
        county = county_report.CountyReport(STATE, county_name, confirmed,
                                            deaths, hospitalizations, -1,
                                            datetime.datetime.now())
        counties.append(county)

    browser.quit()

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
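# get_row_data is referenced by the Selenium-based scrapers but not defined in
# this listing. A plausible sketch, assuming it flattens each <tr> WebElement
# into a list of its cells' text (same Selenium 3-era API as the code above):
def get_row_data(htmlRows):
    rows = []
    for htmlRow in htmlRows:
        # collect the text of every <td> cell in this table row
        cells = htmlRow.find_elements_by_xpath('.//td')
        rows.append([cell.text.strip() for cell in cells])
    return rows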
def scraper():
    # make an HTTP web request to get the AK Json
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        features = jsonPayload['features']

        counties = []
        for feature in features:
            attribute = feature['attributes']
            county_name = attribute['Borough_Census_Area']
            confirmed = int(attribute['All_Cases'])
            hospitalizations = int(attribute['Hospitalizations'])
            deaths = int(attribute['Deaths'])

            # a borough/census area can appear more than once, so accumulate
            county = findCounty(county_name, counties)
            if county is None:
                county = county_report.CountyReport(STATE, county_name,
                                                    confirmed, deaths,
                                                    hospitalizations, -1,
                                                    datetime.datetime.now())
                counties.append(county)
            else:
                county.confirmed += confirmed
                county.hospitalizations += hospitalizations
                county.deaths += deaths

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the MI XLSX file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        data = io.BytesIO(response.content)
        wb = openpyxl.load_workbook(filename=data, read_only=True, data_only=True)
        sheet = wb.worksheets[0]

        counties = []
        # the data rows occupy a fixed range of the sheet
        for i in range(2, 169):
            rowCount = str(i)
            status = sheet['B' + rowCount].value
            if status == 'Confirmed':
                county_name = sheet['A' + rowCount].value
                confirmed = sheet['C' + rowCount].value
                deaths = sheet['D' + rowCount].value
                county = county_report.CountyReport(STATE, county_name,
                                                    int(confirmed), int(deaths),
                                                    -1, -1,
                                                    datetime.datetime.now())
                counties.append(county)  # append the countyReport to our list of counties

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the CO Json
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        features = jsonPayload['features']

        counties = []
        # each county arrives as multiple features, one per metric, so look up
        # (or create) the county first and then fill in the matching metric
        for feature in features:
            attribute = feature['attributes']
            county_name = attribute['LABEL']

            county = findCounty(county_name, counties)
            if county is None:
                county = county_report.CountyReport(STATE, county_name, 0, 0,
                                                    -1, -1,
                                                    datetime.datetime.now())
                counties.append(county)

            metric = attribute['Metric']
            if metric == 'Cases':
                county.confirmed = int(attribute['Value'])
            elif metric == 'Deaths':
                county.deaths = int(attribute['Value'])

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the CT Json file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)

        counties = []
        for item in jsonPayload:
            county_name = item['county']

            # fields may be absent from a record, so default each metric to 0
            confirmed = 0
            if 'confirmedcases' in item:
                confirmed = int(item['confirmedcases'])
            hospitalizations = 0
            if 'hospitalization' in item:
                hospitalizations = int(item['hospitalization'])
            deaths = 0
            if 'confirmeddeaths' in item:
                deaths = int(item['confirmeddeaths'])

            county = findCounty(county_name, counties)
            if county is None:
                county = county_report.CountyReport(STATE, county_name,
                                                    confirmed, deaths,
                                                    hospitalizations, -1,
                                                    datetime.datetime.now())
                counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the data
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        csvData = response.text

        # read the in-memory string using the 'csv' module so we can iterate over each row
        csvReader = csv.reader(csvData.splitlines(), delimiter=',', quotechar='"')

        # create a list that will contain our county data
        counties = []

        # iterate over every row in the CSV
        for row in csvReader:
            # skip the header row
            if row[0] == 'BOROUGH_GROUP':
                continue

            county_name = row[0]
            confirmed = int(row[4])
            hospitalizations = int(row[5])
            deaths = int(row[6])
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, hospitalizations, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the data
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        jsonPayload = json.loads(response.text)
        features = jsonPayload['features']

        counties = []
        for feature in features:
            attribute = feature['attributes']
            county_name = attribute['NAME']
            confirmed = int(attribute['CASES'])
            deaths = int(attribute['DEATHS'])
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
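# county_report.CountyReport and state_report.StateReport are project-local
# classes this listing never defines. From the call sites, a plausible sketch is
# below; the field names, and the meaning of the sixth constructor argument
# (always -1 here except the GA scraper, which passes a float), are assumptions.
class CountyReport:
    def __init__(self, state, county, confirmed, deaths, hospitalizations,
                 rate, timestamp):
        self.state = state
        self.county = county
        self.confirmed = confirmed                # cumulative confirmed cases
        self.deaths = deaths                      # cumulative deaths
        self.hospitalizations = hospitalizations  # -1 when the source lacks it
        self.rate = rate                          # sixth field; exact meaning unclear
        self.timestamp = timestamp                # when this row was scraped


class StateReport:
    def __init__(self, state, state_abbr, counties, timestamp):
        self.state = state
        self.state_abbr = state_abbr
        self.counties = counties                  # list of CountyReport objects
        self.timestamp = timestamp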
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    browser.get(URL)

    try:
        counties_link = browser.find_element_by_id('open-counties-table-modal')
        counties_link.click()

        rootCountyDiv = browser.find_elements_by_class_name('counties-table')
        htmlRows = rootCountyDiv[0].find_elements_by_xpath(".//tbody/tr")
        rows = get_row_data(htmlRows)
        for row in rows:
            county_name = row[0]
            confirmed = int(row[1])
            deaths = int(row[2])
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

    browser.quit()

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
def scraper():
    # make an HTTP web request to get the AR data
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        table = bs4.BeautifulSoup(response.text,
                                  features="html.parser").select('table tr')

        counties = []
        # row 0 is the header; the county rows follow
        for i in range(1, 75):
            row = table[i].find_all('td')
            county_name = row[0].find('p').getText()
            confirmed = int(row[1].find('p').getText())
            deaths = int(row[3].find('p').getText())
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    file_path = pathlib.Path.home().joinpath('Downloads', FILE_NAME)
    try:
        browser.get(URL)

        if os.path.isfile(file_path):
            print(
                " FAILED on ", STATE, " : Please delete ", file_path,
                " and start the process over. This file must not exist prior to running the scrape operation."
            )
            browser.quit()
            return None

        download_link = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located(
                (By.XPATH,
                 '/html/body/div[2]/div[2]/div/div[1]/div/div[1]/a[2]')))
        download_link.click()
        time.sleep(4)  # give the download time to finish

        wb = openpyxl.load_workbook(filename=file_path)
        sheet = wb.worksheets[0]
        max_rows = sheet.max_row
        for i in range(2, max_rows):
            rowCount = str(i)
            county_name = sheet['A' + rowCount].value
            if county_name is None or len(county_name) == 0:
                continue
            confirmed = sheet['B' + rowCount].value
            deaths = sheet['D' + rowCount].value
            county = county_report.CountyReport(STATE, county_name,
                                                int(confirmed), int(deaths),
                                                -1, -1,
                                                datetime.datetime.now())
            counties.append(county)  # append the countyReport to our list of counties
        wb.close()
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

    browser.quit()
    if os.path.isfile(file_path):
        os.remove(file_path)

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
def scraper():
    # make an HTTP web request to get the state's XLSX file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        temppath = 'temp'
        if not os.path.exists(temppath):
            os.makedirs(temppath)
        tempfilename = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx'
        tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename)
        with open(tempfilepath, "wb") as file:
            file.write(response.content)

        wb = openpyxl.load_workbook(filename=tempfilepath)
        sheet = wb.worksheets[0]
        max_rows = sheet.max_row

        counties = []
        countyDictionary = {}
        # walk the sheet bottom-up so only the bottom-most row per county is kept
        i = max_rows
        while i > 2:
            rowCount = str(i)
            county_name = sheet['B' + rowCount].value
            county = findCounty(county_name, countyDictionary)
            if county is None:
                confirmed = int(sheet['E' + rowCount].value)
                deaths = int(sheet['P' + rowCount].value)
                county = county_report.CountyReport(STATE, county_name,
                                                    confirmed, deaths, -1, -1,
                                                    datetime.datetime.now())
                counties.append(county)  # append the countyReport to our list of counties
                countyDictionary[county_name] = county
            i = i - 1

        # since the above algorithm outputs the counties in reverse-ABC order,
        # reverse that so they're in ABC order
        counties = list(reversed(counties))

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
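# Unlike the list-based findCounty sketched earlier, this scraper's module
# passes a dict, so its version is presumably a straight dictionary lookup.
# A sketch, assuming the dict maps county name -> CountyReport:
def findCounty(county_name, countyDictionary):
    # return the already-created report for this county, or None if absent
    if county_name in countyDictionary:
        return countyDictionary[county_name]
    return None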
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    try:
        browser.get(URL)
        time.sleep(4)

        county_link = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((
                By.XPATH,
                '/html/body/div[1]/div[3]/div/article/div/div/div/ul[1]/li[1]/a'
            )))
        county_link.click()
        time.sleep(4)

        # switch the paged table to show all rows at once
        all_link = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="pagin"]/li[12]/a')))
        all_link.click()
        time.sleep(2)

        county_table = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((
                By.XPATH,
                '/html/body/div[1]/div[3]/div/article/div/div/div/table/tbody'
            )))
        time.sleep(2)

        htmlRows = county_table.find_elements_by_xpath(".//tr")
        rows = get_row_data(htmlRows)
        for row in rows:
            county_name = row[0]
            # skip the statewide roll-up row
            if county_name == 'Illinois':
                continue
            confirmed = int(row[2])
            deaths = int(row[3])
            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

    browser.quit()

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
def scraper():
    # make an HTTP web request to get the source information
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        soup = bs4.BeautifulSoup(response.text, features="html.parser")
        table = soup.find_all("table", attrs={"summary": "Cases by County"})

        counties = []
        for item in table[0].find_all('tr'):
            row = item.find_all('td')
            if len(row) == 0:
                continue
            county_name = row[0].text
            if county_name == 'Total':
                continue

            casesStr = row[1].text
            deathsStr = row[2].text
            # blank cells arrive empty or as non-breaking-space padding; treat as zero
            if len(casesStr) == 0 or casesStr == '\xa0' or casesStr == '\xa0\n\t':
                casesStr = '0'
            if len(deathsStr) == 0 or deathsStr == '\xa0' or deathsStr == '\xa0\n\t':
                deathsStr = '0'
            confirmed = int(casesStr)
            deaths = int(deathsStr)

            county = county_report.CountyReport(STATE, county_name, confirmed,
                                                deaths, -1, -1,
                                                datetime.datetime.now())
            counties.append(county)

        # print the number of counties we processed
        print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

        # build the state-level report object that will include all of the counties
        stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
def scraper():
    # make an HTTP web request to get the file
    response = requests.get(URL)

    if response.status_code == requests.codes.ok:
        # Success - print to the console that the HTTP request succeeded
        print(' ', STATE_ABBR, ': Download succeeded')

        # Writing the XLSX to disk makes the loop below orders of magnitude faster
        # versus keeping the XLSX doc in-memory, so we create a temp folder and
        # download the file there.
        temppath = 'temp'
        if not os.path.exists(temppath):
            os.makedirs(temppath)
        tempfilename = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H%M%S") + '_temp_' + STATE_ABBR + '.xlsx'
        tempfilepath = pathlib.Path.cwd().joinpath('temp', tempfilename)
        with open(tempfilepath, "wb") as file:
            file.write(response.content)

        wb = openpyxl.load_workbook(filename=tempfilepath)
        sheet = wb.worksheets[0]

        parishes = []
        parishesDictionary = {}
        max_rows = sheet.max_row
        for i in range(2, max_rows):
            rowCount = str(i)
            parish_name = sheet['B' + rowCount].value
            confirmed = sheet['F' + rowCount].value

            # a parish can span multiple rows, so accumulate its counts
            parish = findParish(parish_name, parishesDictionary)
            if parish is None:
                parish = county_report.CountyReport(STATE, parish_name,
                                                    int(confirmed), -1, -1, -1,
                                                    datetime.datetime.now())
                parishes.append(parish)
                parishesDictionary[parish_name] = parish
            else:
                parish.confirmed += int(confirmed)

        # print the number of parishes we processed
        print(' ', STATE_ABBR, ':', len(parishes), ' counties processed OK')

        # build the state-level report object that will include all of the parishes
        stateReport = state_report.StateReport(STATE, STATE_ABBR, parishes,
                                               datetime.datetime.now())

        # return the state-level report
        return stateReport
    else:
        # Fail
        print(' ', STATE_ABBR, ': ERROR : Web download failed - HTTP status code ',
              response.status_code)
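# findParish mirrors the dictionary-based findCounty sketched above; a sketch
# under the same assumption (a parish name -> CountyReport mapping):
def findParish(parish_name, parishesDictionary):
    return parishesDictionary.get(parish_name)  # None when the parish is new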
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    browser.get(URL)

    file_path = pathlib.Path.home().joinpath('Downloads',
                                             'Testing and Outcomes by County.csv')
    if os.path.isfile(file_path):
        print(
            " FAILED on ", STATE, " : Please delete ", file_path,
            " and start the process over. This file must not exist prior to running the scrape operation."
        )
        browser.quit()
        return None

    # click through the dashboard's export dialog: crosstab -> counties sheet -> CSV
    download_link = browser.find_element_by_xpath(
        '/html/body/div[2]/div[3]/div[2]/div[1]/div[2]/div[5]')
    download_link.click()

    crosstab_link = WebDriverWait(browser, 30).until(
        EC.presence_of_element_located(
            (By.XPATH,
             '/html/body/div[6]/div/div/div/div/div[2]/div/button[3]')))
    crosstab_link.click()

    counties_link = WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((
            By.XPATH,
            '/html/body/div[7]/div/div/div/div/div[2]/div/div[1]/div[2]/div/div/div[2]/div/div/div'
        )))
    counties_link.click()

    csv_link = WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((
            By.XPATH,
            '/html/body/div[7]/div/div/div/div/div[2]/div/div[2]/div[2]/div/label[2]'
        )))
    csv_link.click()

    download_button = WebDriverWait(browser, 30).until(
        EC.presence_of_element_located(
            (By.XPATH,
             '/html/body/div[7]/div/div/div/div/div[2]/div/div[3]/button')))
    download_button.click()
    time.sleep(2)  # give the download time to finish

    # the exported crosstab is UTF-16 LE and tab-delimited
    with open(file_path, 'rt', encoding='utf-16-le') as file_contents:
        data = file_contents.read()

    csv_reader = csv.reader(data.splitlines(), delimiter='\t', quotechar='"')
    for row in csv_reader:
        county_name = row[0]
        # skip the header row and the statewide 'All' roll-up
        if county_name == 'County' or county_name == 'All' or row[
                1] == 'Cases per 100,000':
            continue
        confirmed = row[2].replace(',', '')
        if len(confirmed) == 0:
            confirmed = '0'
        deaths = row[3].replace(',', '')
        if len(deaths) == 0:
            deaths = '0'
        county = county_report.CountyReport(STATE, county_name, int(confirmed),
                                            int(deaths), -1, -1,
                                            datetime.datetime.now())
        counties.append(county)

    browser.quit()
    os.remove(file_path)

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
def scraper():
    counties = []

    # You will need a WebDriver for Edge. See https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
    browser = webdriver.Edge("msedgedriver.exe")
    try:
        browser.get(URL)
        time.sleep(1)

        show_table_link = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((
                By.XPATH,
                '/html/body/div[1]/div[1]/div[3]/div[3]/div/div/div[3]/div[2]/div/div[1]/div[2]/div[1]/div[1]/div/div[2]/span'
            )))
        show_table_link.click()
        time.sleep(1)

        county_div = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((
                By.XPATH,
                '/html/body/div[1]/div[1]/div[3]/div[5]/div[4]/div[1]/div/div/div/div[1]/div/div/div[2]/div/div[2]/div'
            )))
        county_div_rows = county_div.find_elements_by_xpath(
            './/div[@role="row"]')

        # SC puts its county-level data into lots of <div> elements, one <div>
        # per county. Each <div> has its own single-row <table> that contains
        # the county data, so we have some extra unwrapping to do.
        for div_row in county_div_rows:
            county_table = div_row.find_element_by_xpath('.//table')
            htmlRows = county_table.find_elements_by_xpath(".//tr")
            rows = get_row_data(htmlRows)
            for row in rows:
                county_name = row[0]
                if county_name == 'Unknown':
                    continue
                confirmed = int(row[3].replace(',', ''))
                deaths = int(row[4].replace(',', ''))
                county = county_report.CountyReport(STATE, county_name,
                                                    confirmed, deaths, -1, -1,
                                                    datetime.datetime.now())
                counties.append(county)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

    browser.quit()

    # print the number of counties we processed
    print(' ', STATE_ABBR, ':', len(counties), ' counties processed OK')

    # build the state-level report object that will include all of the counties
    stateReport = state_report.StateReport(STATE, STATE_ABBR, counties,
                                           datetime.datetime.now())

    # return the state-level report
    return stateReport
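# A hypothetical driver showing how one of these per-state scraper modules
# might be invoked. Note that the failure paths above print an error and fall
# through, so scraper() can return None; the report field names match the
# CountyReport/StateReport sketch earlier and are assumptions:
if __name__ == '__main__':
    report = scraper()
    if report is not None:
        for county in report.counties:
            print(county.county, county.confirmed, county.deaths)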