def add_geograph_socioeconom(fixed_data_dict):

	DB_indicator_ref = pd.ExcelFile("data/suplement_inputs/input_fixed.xlsx")  # open the database
	DB_ind = DB_indicator_ref.parse('Sheet1')
	indicator_cols = list(DB_ind.columns)[1:]  # skip the name column of DB_ind
	# initialize the new columns, one per indicator
	for j in indicator_cols:
		fixed_data_dict[j] = []
	# fill the new columns
	for name in fixed_data_dict["Name"]:
		# find the row whose accent-stripped, upper-cased name matches
		row_idx = -1
		for i in range(len(DB_ind)):
			if strip_accents(DB_ind.iloc[i, 0]).upper() == name:
				row_idx = i
				break
		for col, j in enumerate(indicator_cols, start=1):
			# -1 is the missing-data sentinel
			fixed_data_dict[j].append(DB_ind.iloc[row_idx, col] if row_idx >= 0 else -1)

	return fixed_data_dict
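
# Every function in this listing calls a strip_accents helper that is not
# shown here. A minimal sketch of what it presumably does, using unicodedata
# to drop combining marks (the exact helper may differ):
import unicodedata

def strip_accents(text):
	# decompose characters (e.g. 'Á' -> 'A' + combining acute),
	# then drop the combining marks
	decomposed = unicodedata.normalize('NFD', str(text))
	return ''.join(c for c in decomposed if not unicodedata.combining(c))

# strip_accents('São José').upper() == 'SAO JOSE'
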
def get_additionalVariables(district_list):
    # build a variable dict for each district
    DB = pd.ExcelFile(
        "data/suplement_inputs/A - DICIONÁRIO dos indicadores do Atlas.xlsx")
    DB = DB.parse("Plan1")  # choose a sheet and parse it
    col_dict = dict([])
    for i in range(20, 247):  # these rows hold the Atlas variable names
        col_dict[i - 20] = DB.iloc[i, 0]
    # initialize one empty list per (district, variable) pair
    dist_var = dict([])
    for dist in district_list:
        for var in col_dict.keys():
            dist_var[dist + str(var)] = []
    DB = pd.ExcelFile(
        "data/suplement_inputs/RM 62600 Recife - Base UDH 2000_2010.xlsx")
    DB = DB.parse("Sheet1")  # choose a sheet and parse it
    for i in range(len(DB)):
        if DB.iloc[i, 11] == 2010:  # keep only rows for the year 2010
            city_i = DB.iloc[i, 6]
            if city_i == 'Recife':  # keep only Recife's districts
                dists_i = DB.iloc[i, 3].split("/")
                for cnt, dist in enumerate(dists_i, start=1):  # loop over all districts
                    clean_dist = dist
                    # the last district in the cell may carry a trailing space
                    if cnt == len(dists_i):
                        clean_dist = dist.rstrip()
                    key_prefix = strip_accents(clean_dist).upper()
                    for var in col_dict.keys():
                        val_temp = DB.iloc[i, var + 12]
                        if np.isnan(val_temp):
                            val_temp = 0
                        dist_var[key_prefix + str(var)].append(val_temp)
    return dist_var, col_dict
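
# A hypothetical call, assuming the Atlas spreadsheets are in place. Each
# dist_var key is a district name concatenated with a variable index, and
# col_dict maps that index back to the variable's name:
dist_var, col_dict = get_additionalVariables(['BOA VIAGEM', 'CASA FORTE'])
print(col_dict[0])                      # name of the first Atlas variable
print(dist_var['BOA VIAGEM' + str(0)])  # its values across Boa Viagem's UDHs
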
def get_codes():
	# read the file containing Recife's neighborhood codes
	DB_PE = pd.read_csv("data/suplement_inputs/bairrosCod.csv", encoding = 'latin-1')
	col_raw = DB_PE['cod'] # get raw data...
	# initialize the code dict
	code_neighborhoods = dict([])
	for row in col_raw:
		row_split = row.split(',') # split this to get two columns: [code, name]
		name = row_split[1].replace('"','') # get name (key)
		code = row_split[0] # get code
		code_neighborhoods[strip_accents(name).upper()] = code

	return code_neighborhoods
def add_population(fixed_data_dict):

	DB = pd.ExcelFile("data/suplement_inputs/pop_bairros_Rec_2019.xlsx")  # open the database
	popDB = DB.parse("Plan1")  # choose a sheet and parse it
	fixed_data_dict["population_2019"] = []  # initialize the new column
	for name in fixed_data_dict["Name"]:
		missingData = True
		for i in range(len(popDB)):
			if strip_accents(popDB.iloc[i, 0]).upper() == name:
				missingData = False
				orig = str(popDB.iloc[i, 1])
				# drop the non-breaking spaces used as thousands separators
				fixed_data_dict["population_2019"].append(int(orig.replace("\xa0", "")))
				break
		if missingData:
			fixed_data_dict["population_2019"].append(-1)  # missing-data sentinel

	return fixed_data_dict
Example #5
def getAssocIdxs(listNames, _listNames_, matType):
    # for each (already accent-stripped, upper-cased) name in listNames,
    # return the index of its counterpart in _listNames_; matType is unused
    assocIdxs = []
    for i in listNames:
        i_detected = False
        for count, ii in enumerate(_listNames_):
            if strip_accents(ii).upper() == i:
                i_detected = True
                assocIdxs.append(count)
                break
        if not i_detected:
            print(i + " not found")

    return assocIdxs
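
# For illustration, a small run with made-up inputs: getAssocIdxs maps each
# already accent-stripped, upper-cased name in listNames to the index of its
# counterpart in _listNames_ (matType goes unused):
idxs = getAssocIdxs(['SAO JOSE', 'TORRE'], ['Torre', 'São José'], None)
# strip_accents('São José').upper() == 'SAO JOSE', so idxs == [1, 0]
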
def add_coords(fixed_data_dict):
	# source: inloco
	fixed_data_dict["lat"] = []
	fixed_data_dict["long"] = []
	DB = pd.read_csv("data/suplement_inputs/bairros_localizacao.csv", encoding='latin-1')  # open the database
	for name in fixed_data_dict["Name"]:
		hasValue = False
		for i in range(len(DB)):
			if strip_accents(DB.iloc[i, 0]).upper() == name:
				fixed_data_dict["long"].append(float(DB.iloc[i, 1]))
				# some latitude cells carry a stray ';' that must be dropped
				fixed_data_dict["lat"].append(float(str(DB.iloc[i, 2]).replace(";", "")))
				hasValue = True
				break
		if not hasValue:
			print(name + " coords not found")

	return fixed_data_dict
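
# A sketch of how these builders appear to chain together, assuming the
# input spreadsheets exist. The dict starts from the neighborhood names in
# get_codes(), and each add_* call appends one value per name:
code_neighborhoods = get_codes()
fixed_data = {'Name': list(code_neighborhoods.keys())}
fixed_data = add_geograph_socioeconom(fixed_data)
fixed_data = add_population(fixed_data)
fixed_data = add_coords(fixed_data)
df = pd.DataFrame(fixed_data)  # one row per neighborhood
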
Example #7
import re
import locale
from datetime import datetime
from io import BytesIO

import requests
import pdfminer.layout
import pdfminer.high_level


def parseVotePDF(url):
    scrs = {}
    # layout parameters for the PDF text extraction
    laparams = pdfminer.layout.LAParams(word_margin=0.4, char_margin=3)

    content = requests.get(url).content
    fp = BytesIO(content)
    # extract the PDF's text into an in-memory buffer
    txtfp = BytesIO()

    pdfminer.high_level.extract_text_to_fp(fp,
                                           outfp=txtfp,
                                           codec='utf-8',
                                           laparams=laparams)
    r = txtfp.getvalue().decode('utf8')

    # French month names are needed to parse the session date below
    locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')

    # split the text into (ballot number, ballot body) pairs
    scrutins = re.split(r'Analyse du scrutin[ n]+. *(\d+)', r)[1:]
    scrutins = [scrutins[x:x + 2] for x in range(0, len(scrutins), 2)]
    for noscrutin, rscrutin in scrutins:
        print(url, noscrutin)
        pages = re.split(r'Page \d+ sur \d+[ \n\r\x0c]+', rscrutin)
        synthese, pages = pages[0], strip_accents(''.join(pages[1:]))
        pages = re.split(r'Mises au point', pages) + ['']
        pages, miseaupoint = pages[0], pages[1:]
        pages = ''.join(re.split(r'[\w ,:]+\(\d+\) *\n', pages))
        pages = re.split(r'([\w\-\(\)]+) : (\d+)', pages)[1:]
        positions = [pages[x:x + 3] for x in range(0, len(pages), 3)]

        synthese = synthese.replace('\n', ' ').replace('  ', ' ')
        datestr = re.search(r's.ance du \w+ (\d+ [^ ]+ \d+)',
                            synthese).groups()[0]
        date = datetime.strptime(datestr, "%d %B %Y")

        libelle = re.search(r'Scrutin public sur (.*). Demand. par :',
                            synthese)
        if libelle:
            libelle = libelle.groups()[0]
        else:
            libelle = re.search(r'Scrutin public sur (.*). Synth',
                                synthese).groups()[0]

        scrutin = {
            'num': int(noscrutin),
            'id': '%s_%s' % (legislature, noscrutin),
            'desc': libelle,
            'date': date.strftime('%d/%m/%Y'),
            'votes': {
                'pour': [],
                'contre': [],
                'abstention': [],
                'nonVotant': []
            }
        }

        pb = False
        avotes = {}
        for pos, nb, act in positions:
            act = act.split('\n\n')
            if len(act) > 1 and 'au moment du scrutin' in act[-1]:
                del act[-1]
            act = '\n'.join(act)

            # flatten the name list: normalize separators, drop whitespace
            # and parenthesized annotations, and expand the 'oe' ligature
            act = (act.replace(' et ', ',').replace(' et', ',')
                      .replace('\net', ',').replace('\n', '')
                      .replace(' ', '').replace('\u0153', 'oe'))
            act = re.sub(r'\([^\)]+\)', r'', act).split(',')
            if int(nb) != len(act):
                # the announced count and the parsed names disagree
                print(int(nb), len(act), "probleme")
                pb = True
            for a in act:
                avotes[normalize(a)] = votes[pos]
        if not pb:
            for a in avotes.keys():
                scrutin['votes'][avotes[a]].append(a)

            scrutin['ok'] = True
            scrs[noscrutin] = scrutin
    return scrs
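
# parseVotePDF relies on three module-level names that are not shown:
# legislature, normalize, and a votes mapping from the position labels found
# in the PDF to the keys of scrutin['votes']. A guessed minimal setup (the
# exact labels and values are assumptions, not the original code):
legislature = 14  # assumed legislature number used to build scrutin ids

votes = {  # assumed mapping: PDF position header -> vote-dict key
    'Pour': 'pour',
    'Contre': 'contre',
    'Abstention': 'abstention',
    'Non-votants': 'nonVotant',
}

def normalize(name):
    # assumed: canonicalize a deputy's name for use as a dict key
    return strip_accents(name).lower()
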
def update_covid_data_bairros(code_neighborhoods, start_date):

    # 1 - check the last date available in the data repository
    print("\nchecking the last date on which the repository was updated...")
    lastDate = check_data_last_date(start_date)
    update_start_date = lastDate

    # 2 - after this checkup, read from PCR's database on Google Drive
    print("updating data up to " + str(date.today()))
    update_ = False  # assume there is nothing to update
    delta = timedelta(days=1)
    end_date = []
    dist_with_confirmed_cases = dict([])
    dist_uti = dict([])
    dist_enf = dict([])
    dist_death = dict([])
    while lastDate <= date.today():
        DB_PE = []
        try:
            date_formt = str(lastDate.strftime("%Y_%m_%d"))  # formatted date
            print("processing " + str(date_formt))
            DB_PE = pd.ExcelFile("data/PCR Dados Gonçalves/Base_Covid-19_" +
                                 date_formt + "_VF.xlsx")
        except FileNotFoundError:
            print(str(lastDate) + " file not found")
            return lastDate

        # an update is needed...
        update_ = True
        # fix some district names in the raw data
        DB_PE = DB_PE.parse("Dados")  # choose a sheet and parse it
        DB_PE = DB_PE.replace({
            'NMBAIRRO': {
                'ILHA JOANA BEZERRA': 'JOANA BEZERRA',
                'RECIFE': 'BAIRRO DO RECIFE',
                'ALTO SANTA TERESINHA': 'SANTA TEREZINHA',
                'PAU FERRO': 'PAU-FERRO'
            }
        })

        # 3 - collect new data and write consolidated information for each neighborhood
        # initialize the database that will be filled with data from lastDate
        current_DB = dict([])
        current_DB["Name"] = []
        current_DB["Code"] = []
        current_DB["Active_cases"] = []

        # gather new information
        cities = DB_PE.groupby(['NMBAIRRO', 'CSTATUS'])
        for name, group in cities:
            clean_name = strip_accents(name[0]).upper()
            if name[1] == 'CONFIRMADO' and clean_name in code_neighborhoods:
                # an active case is anyone confirmed who is isolated at home,
                # in an isolation ward, in an ICU, or hospitalized outside an
                # isolation ward
                total_active_cases = 0
                total_active_cases += (group['NMEVOLUCAO'] ==
                                       'ISOLAMENTO DOMICILIAR').sum()
                total_active_cases += (group['NMEVOLUCAO'] ==
                                       'INTERNADO LEITO DE ISOLAMENTO').sum()
                total_active_cases += (group['NMEVOLUCAO'] ==
                                       'INTERNADO UTI').sum()
                total_active_cases += (
                    group['NMEVOLUCAO'] ==
                    'INTERNADO, MAS NÃO ESTÁ EM LEITO DE ISOLAMENTO').sum()

                # add a row to the database
                current_DB["Name"].append(clean_name)
                current_DB["Code"].append(code_neighborhoods[clean_name])
                current_DB["Active_cases"].append(total_active_cases)

        # write the new database
        df = pd.DataFrame(current_DB)
        df.to_excel("data/data_repo/pcr_data_" + str(lastDate) + ".xlsx")

        lastDate += delta  # next date
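
# A hypothetical driver for the updater, assuming the PCR spreadsheets live
# under data/PCR Dados Gonçalves/ and that check_data_last_date is defined
# elsewhere; on a missing file the function returns the first absent date:
from datetime import date, timedelta

code_neighborhoods = get_codes()
stopped_at = update_covid_data_bairros(code_neighborhoods, date(2020, 3, 14))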