import locale
import re
from datetime import date, datetime, timedelta
from io import BytesIO

import numpy as np
import pandas as pd
import pdfminer.high_level
import pdfminer.layout
import requests


def add_geograph_socioeconom(fixed_data_dict):
    # Open the database of fixed geographic / socioeconomic indicators
    DB_indicator_ref = pd.ExcelFile("data/suplement_inputs/input_fixed.xlsx")
    DB_ind = DB_indicator_ref.parse('Sheet1')

    # Initialize the new columns (skip the first column of DB_ind, which holds the name)
    idx = 0
    for j in DB_ind:  # loop over each indicator
        if idx > 0:
            fixed_data_dict[j] = []
        idx += 1

    # Fill the new columns
    for name in fixed_data_dict["Name"]:
        idx = 0
        missingData = True
        for j in DB_ind:
            if idx > 0:
                for i in range(len(DB_ind)):  # search for the row matching 'name'
                    if strip_accents(DB_ind.iloc[i, 0]).upper() == name:
                        missingData = False
                        fixed_data_dict[j].append(DB_ind.iloc[i, idx])
                if missingData:
                    fixed_data_dict[j].append(-1)  # placeholder for missing data
            idx += 1
    return fixed_data_dict
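
# Note: strip_accents is used throughout this module but is not defined in this
# section; it is assumed to remove diacritics so that neighborhood names can be
# compared case- and accent-insensitively. A minimal sketch of that assumed
# behavior (the project may define or import its own version elsewhere):
#     import unicodedata
#     def strip_accents(text):
#         # decompose accented characters and drop the combining marks
#         return ''.join(c for c in unicodedata.normalize('NFD', str(text))
#                        if unicodedata.category(c) != 'Mn')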

def get_additionalVariables(district_list):
    # Get a dict of additional variables for each district, from the Atlas indicator files
    DB = pd.ExcelFile(
        "data/suplement_inputs/A - DICIONÁRIO dos indicadores do Atlas.xlsx")
    DB = DB.parse("Plan1")  # choose a sheet and parse it...

    # Map variable index -> indicator name (rows 20..246 of the dictionary sheet)
    col_dict = dict([])
    for i in range(20, 247):
        col_dict[i - 20] = DB.iloc[i, 0]

    # Initialize one empty list per (district, variable) pair
    dist_var = dict([])
    for dist in district_list:
        for var in col_dict.keys():
            dist_var[dist + str(var)] = []

    DB = pd.ExcelFile(
        "data/suplement_inputs/RM 62600 Recife - Base UDH 2000_2010.xlsx")
    DB = DB.parse("Sheet1")  # choose a sheet and parse it...

    for i in range(len(DB)):
        if DB.iloc[i, 11] == 2010:  # keep only year 2010
            city_i = DB.iloc[i, 6]
            if city_i == 'Recife':  # keep only Recife districts
                dists_i = DB.iloc[i, 3].split("/")
                cnt = 0
                for dist in dists_i:  # loop over all districts in this UDH
                    cnt += 1
                    clean_dist = dist
                    # strip a trailing space from the last district name
                    if cnt == len(dists_i) and dist[len(dist) - 1] == " ":
                        clean_dist = dist[0:len(dist) - 1]
                    for var in col_dict.keys():
                        val_tempp = DB.iloc[i, var + 12]
                        if np.isnan(val_tempp):
                            val_tempp = 0
                        key = strip_accents(clean_dist).upper() + str(var)
                        dist_var[key] = dist_var[key] + [val_tempp]
    return dist_var, col_dict
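
# Usage sketch (assumed call pattern): district_list should already contain
# normalized names, e.g. the keys returned by get_codes(). dist_var is keyed by
# the normalized district name concatenated with the variable index (e.g.
# dist_var["BOA VIAGEM" + str(3)] holds that indicator's values over the
# district's UDH rows), and col_dict maps each index back to the indicator name.
#     dist_var, col_dict = get_additionalVariables(list(get_codes().keys()))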

def get_codes():
    # Read the file containing Recife's neighborhood codes
    DB_PE = pd.read_csv("data/suplement_inputs/bairrosCod.csv", encoding='latin-1')
    col_raw = DB_PE['cod']  # get raw data...

    # Initialize the code dict: normalized neighborhood name -> code
    code_neighborhoods = dict([])
    for row in col_raw:
        row_split = row.split(',')  # split this to get two columns: [code, name]
        name = row_split[1].replace('"', '')  # get name (key)
        code = row_split[0]  # get code
        code_neighborhoods[strip_accents(name).upper()] = code
    return code_neighborhoods
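
# Usage sketch: get_codes() returns a dict keyed by the accent-stripped,
# upper-cased neighborhood name; the values are the code strings stored in
# bairrosCod.csv. For example (assuming Boa Viagem appears in the file):
#     codes = get_codes()
#     codes["BOA VIAGEM"]  # -> that neighborhood's code string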

def add_population(fixed_data_dict):
    # Open the 2019 population database
    DB = pd.ExcelFile("data/suplement_inputs/pop_bairros_Rec_2019.xlsx")
    popDB = DB.parse("Plan1")  # choose a sheet and parse it...

    fixed_data_dict["population_2019"] = []  # initialize the new column
    for name in fixed_data_dict["Name"]:
        missingData = True
        for i in range(len(popDB)):
            if strip_accents(popDB.iloc[i, 0]).upper() == name:
                missingData = False
                orig = str(popDB.iloc[i, 1])
                # strip the non-breaking spaces used as thousands separators
                fixed_data_dict["population_2019"].append(int(orig.replace("\xa0", "")))
        if missingData:
            fixed_data_dict["population_2019"].append(-1)
    return fixed_data_dict

def getAssocIdxs(listNames, _listNames_, matType):
    # For each name in listNames, find its index in _listNames_ (accent/case-insensitive)
    assocIdxs = []
    for i in listNames:
        i_detected = False
        count = 0
        for ii in _listNames_:
            if strip_accents(ii).upper() == i:
                i_detected = True
                assocIdxs.append(count)
                break
            else:
                count += 1
        if not i_detected:
            print(i + " not found")
    return assocIdxs
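
# Usage sketch (hypothetical values): getAssocIdxs maps each (already normalized)
# name in listNames onto its row index in _listNames_; matType is currently unused.
#     idxs = getAssocIdxs(["JOANA BEZERRA"], ["Joana Bezerra", "Boa Viagem"], None)
#     # idxs == [0]; names missing from _listNames_ are printed and skipped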

def add_coords(fixed_data_dict):
    # source: inloco
    fixed_data_dict["lat"] = []
    fixed_data_dict["long"] = []
    DB = pd.read_csv("data/suplement_inputs/bairros_localizacao.csv",
                     encoding='latin-1')  # open the database
    for name in fixed_data_dict["Name"]:
        hasValue = False
        for i in range(len(DB)):
            if strip_accents(DB.iloc[i, 0]).upper() == name:
                fixed_data_dict["long"].append(float(DB.iloc[i, 1]))
                fixed_data_dict["lat"].append(float(str(DB.iloc[i, 2]).replace(";", "")))
                hasValue = True
        if not hasValue:
            print(name + " coords not found")
    return fixed_data_dict
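
# Sketch of how the fixed per-neighborhood table might be assembled from the
# helpers above (the call order and the output path are assumptions; only the
# "Name" column is required beforehand):
#     codes = get_codes()
#     fixed = {"Name": list(codes.keys()), "Code": list(codes.values())}
#     fixed = add_population(fixed)
#     fixed = add_geograph_socioeconom(fixed)
#     fixed = add_coords(fixed)
#     pd.DataFrame(fixed).to_excel("data/data_repo/fixed_data.xlsx")  # hypothetical output file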

def parseVotePDF(url):
    # Parse an Assemblée Nationale "analyse du scrutin" PDF and return one dict per scrutin.
    # `legislature`, `votes` and `normalize` are expected to be defined elsewhere in the module.
    scrs = {}

    # Create a PDF interpreter object.
    laparams = pdfminer.layout.LAParams(word_margin=0.4, char_margin=3)
    content = requests.get(url).content
    fp = BytesIO(content)
    # Create a PDF parser object associated with the file object.
    txtfp = BytesIO()
    pdfminer.high_level.extract_text_to_fp(fp, outfp=txtfp, codec='utf-8', laparams=laparams)
    r = txtfp.getvalue().decode('utf8')

    scrutins = re.split(r'Analyse du scrutin[ n]+. *(\d+)', r)[1:]
    scrutins = [scrutins[x:x + 2] for x in range(0, len(scrutins), 2)]
    for noscrutin, rscrutin in scrutins:
        print(url, noscrutin)
        pages = re.split(r'Page \d+ sur \d+[ \n\r\x0c]+', rscrutin)
        synthese, pages = pages[0], strip_accents(''.join(pages[1:]))
        pages = re.split(r'Mises au point', pages) + ['']
        pages, miseaupoint = pages[0], pages[1:]
        pages = ''.join(re.split(r'[\w ,:]+\(\d+\) *\n', pages))
        pages = re.split(r'([\w\-\(\)]+) : (\d+)', pages)[1:]
        positions = [pages[x:x + 3] for x in range(0, len(pages), 3)]
        synthese = synthese.replace('\n', ' ').replace('  ', ' ')

        datestr = re.search(r's.ance du \w+ (\d+ [^ ]+ \d+)', synthese).groups()[0]
        locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')
        date = datetime.strptime(datestr, "%d %B %Y")

        libelle = re.search(r'Scrutin public sur (.*). Demand. par :', synthese)
        if libelle:
            libelle = libelle.groups()[0]
        else:
            libelle = re.search(r'Scrutin public sur (.*). Synth', synthese).groups()[0]

        scrutin = {
            'num': int(noscrutin),
            'id': '%s_%s' % (legislature, noscrutin),
            'desc': libelle,
            'date': date.strftime('%d/%m/%Y'),
            'votes': {
                'pour': [],
                'contre': [],
                'abstention': [],
                'nonVotant': []
            }
        }

        pb = False
        avotes = {}
        for pos, nb, act in positions:
            act = act.split('\n\n')
            if len(act) > 1 and 'au moment du scrutin' in act[-1]:
                del act[-1]
            act = '\n'.join(act)
            act = act.replace(' et ', ',').replace(' et', ',').replace(
                '\net', ',').replace('\n', '').replace(' ', '').replace(u'\u0153', 'oe')
            act = re.sub(r'\([^\)]+\)', r'', act).split(',')
            if int(nb) != len(act):
                print(int(nb), len(act), "probleme")
                pb = True
            for a in act:
                avotes[normalize(a)] = votes[pos]
        if not pb:
            for a in avotes.keys():
                scrutin['votes'][avotes[a]].append(a)
            scrutin['ok'] = True
        scrs[noscrutin] = scrutin
    return scrs
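
# Usage sketch: parseVotePDF takes the URL of an "analyse du scrutin" PDF and
# returns {scrutin_number: {'num', 'id', 'desc', 'date', 'votes', ...}}, where
# 'votes' groups deputy names under 'pour'/'contre'/'abstention'/'nonVotant'.
#     scrs = parseVotePDF(pdf_url)  # pdf_url: URL string of the scrutin PDF (hypothetical variable)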

def update_covid_data_bairros(code_neighborhoods, start_date):
    # 1 - Check the last date already present in the data repository
    # (check_data_last_date is defined elsewhere in this module)
    print("\nchecking the last date in which the repository was updated...")
    lastDate = check_data_last_date(start_date)
    update_start_date = lastDate

    # 2 - After this check, read PCR's database downloaded from Google Drive
    print("updating data up to " + str(date.today()))
    update_ = False  # assume that there is nothing to update...
    delta = timedelta(days=1)
    end_date = []
    dist_with_confirmed_cases = dict([])
    dist_uti = dict([])
    dist_enf = dict([])
    dist_death = dict([])

    while lastDate <= date.today():
        DB_PE = []
        try:
            date_formt = str(lastDate.strftime("%Y_%m_%d"))  # formatted date
            print("processing " + str(date_formt))
            DB_PE = pd.ExcelFile("data/PCR Dados Gonçalves/Base_Covid-19_" +
                                 date_formt + "_VF.xlsx")
        except FileNotFoundError:
            print(str(lastDate) + " file not found")
            return lastDate

        # an update is needed...
        update_ = True

        # fix some district names in the raw data
        DB_PE = DB_PE.parse("Dados")  # choose a sheet and parse it...
        DB_PE = DB_PE.replace({'NMBAIRRO': {'ILHA JOANA BEZERRA': 'JOANA BEZERRA'}})
        DB_PE = DB_PE.replace({'NMBAIRRO': {'RECIFE': 'BAIRRO DO RECIFE'}})
        DB_PE = DB_PE.replace({'NMBAIRRO': {'ALTO SANTA TERESINHA': 'SANTA TEREZINHA'}})
        DB_PE = DB_PE.replace({'NMBAIRRO': {'PAU FERRO': 'PAU-FERRO'}})

        # 3 - collect new data and write consolidated information for each neighborhood
        # initialize the database that will be filled with data from lastDate
        current_DB = dict([])
        current_DB["Name"] = []
        current_DB["Code"] = []
        current_DB["Active_cases"] = []

        # gather new information
        cities = DB_PE.groupby(['NMBAIRRO', 'CSTATUS'])
        for name, group in cities:
            if (name[1] == 'CONFIRMADO'
                    and strip_accents(name[0]).upper() in code_neighborhoods.keys()):
                # count the active confirmed cases for this neighborhood
                total_active_cases = 0
                total_active_cases += np.sum(group['NMEVOLUCAO'] == 'ISOLAMENTO DOMICILIAR')
                total_active_cases += np.sum(group['NMEVOLUCAO'] == 'INTERNADO LEITO DE ISOLAMENTO')
                total_active_cases += np.sum(group['NMEVOLUCAO'] == 'INTERNADO UTI')
                total_active_cases += np.sum(
                    group['NMEVOLUCAO'] == 'INTERNADO, MAS NÃO ESTÁ EM LEITO DE ISOLAMENTO')

                # add a row to the database
                current_DB["Name"].append(strip_accents(name[0]).upper())
                current_DB["Code"].append(code_neighborhoods[strip_accents(name[0]).upper()])
                current_DB["Active_cases"].append(total_active_cases)

        # write the new database
        df = pd.DataFrame(current_DB)
        df.to_excel("data/data_repo/pcr_data_" + str(lastDate) + ".xlsx")
        lastDate += delta  # next date
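
# Sketch of a daily update run (start_date is whatever date the repository should
# fall back to when empty; the value below is only an example):
#     codes = get_codes()
#     next_missing = update_covid_data_bairros(codes, date(2020, 4, 1))
#     # writes data/data_repo/pcr_data_<date>.xlsx for every day it can process and
#     # typically returns the first date whose PCR spreadsheet is not yet available
#     # (or None if every day up to today was processed)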