def update_datasets(self):
    """Refresh the Chamber of Deputies data and the companies dataset.

    Runs the full pipeline (fetch → translate → clean) on a ``Dataset``
    rooted at ``self.path``, then downloads the companies dataset into
    the same directory.
    """
    os.makedirs(self.path, exist_ok=True)

    dataset = Dataset(self.path)
    dataset.fetch()
    dataset.translate()
    dataset.clean()

    fetch(self.COMPANIES_DATASET, self.path)
def update_datasets(self):
    """Refresh the CEAP data and the companies dataset.

    Runs the CEAP pipeline steps in order (fetch → convert to CSV →
    translate → clean) against ``self.path``, then downloads the
    companies dataset into the same directory.
    """
    os.makedirs(self.path, exist_ok=True)

    dataset = CEAPDataset(self.path)
    for step in (dataset.fetch, dataset.convert_to_csv,
                 dataset.translate, dataset.clean):
        step()

    fetch(self.COMPANIES_DATASET, self.path)
def test_fetch(self, datasets):
    """fetch() must build a Datasets for the target directory and hand
    the file name over to its downloader exactly once."""
    fetch('file.xz', 'test')

    datasets.assert_called_once_with('test')
    download = datasets.return_value.downloader.download
    download.assert_called_once_with('file.xz')
# coding: utf-8

# # One month after the first "mutirão" (task force)
#
# https://datasciencebr.com/um-m%C3%AAs-depois-do-primeiro-mutir%C3%A3o-369975af4bb5

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

# Download the reimbursements dataset and load it. IDs are read as
# strings so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
fetch('2016-12-06-reimbursements.xz', '../data')
reimbursements = pd.read_csv('../data/2016-12-06-reimbursements.xz',
                             dtype={'document_number': str,
                                    'year': str},
                             low_memory=False)

# In[2]:

import os.path
import urllib.request
import zipfile

# Grab a snapshot of the Serenata de Amor "inbox" repository; the zip is
# cached in /tmp so re-running the cell does not download it again.
inbox_url = 'https://github.com/datasciencebr/serenata-de-amor-inbox/archive/master.zip'
inbox_filepath = '/tmp/master.zip'
if not os.path.exists(inbox_filepath):
    urllib.request.urlretrieve(inbox_url, inbox_filepath)
def test_fetch(self, datasets):
    """Verify fetch() wires Datasets('test') to a download of 'file.xz'."""
    fetch('file.xz', 'test')

    datasets.assert_called_once_with('test')
    mocked_datasets = datasets.return_value
    mocked_datasets.downloader.download.assert_called_once_with('file.xz')
# (continuation: final entry of a state-abbreviation → state-name map
# whose opening lines are above this excerpt)
'DF': 'distrito_federal'}

# Template URL for the IBGE 2010 census population totals; one zip file
# per state, filled in via .format() with the state name.
census_link = "ftp.ibge.gov.br/Censos/Censo_Demografico_2010/resultados/total_populacao_{}.zip"

# ## Gathering cities with @cuducos' Brazilian Cities script
#
# @cuducos had already written a script listing all Brazilian cities with
# their associated code and state, in [this repository](https://github.com/cuducos/brazilian-cities).
#
# We checked, and it is the most reliable way to get the cities.

# In[10]:

from serenata_toolbox.datasets import fetch

fetch('2017-05-22-brazilian-cities.csv', '../data')

# In[11]:

# Load the downloaded cities table and peek at the first rows.
brazilian_cities = pd.read_csv('../data/2017-05-22-brazilian-cities.csv')
brazilian_cities.head()

# In[12]:

brazilian_cities.shape

# ## Normalizing its form
#
# * Where expenses with a total net value equal or higher than 100 BRL
# * In which congresspeople from the 2015 term have spent public money
#
# The set of cities was taken from a [random sample that sounded promising](https://twitter.com/cuducos/status/840882495868530688)… but hold your horses: further analysis is disappointing… let's get started.

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

# CNPJ/CPF identifiers are read as strings so leading zeros survive.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
DTYPE = dict(cnpj=str, cnpj_cpf=str)

# In[2]:

fetch('2017-04-21-sex-place-distances.xz', '../data')

# In[3]:

companies = pd.read_csv('../data/2016-09-03-companies.xz',
                        dtype=DTYPE, low_memory=False)
# Strip every non-digit from the CNPJ. regex=True is now explicit:
# pandas 2.0 changed Series.str.replace's default to literal matching,
# which would leave the r'\D' pattern unmatched.
companies.cnpj = companies.cnpj.str.replace(r'\D', '', regex=True)
companies.shape

# In[4]:

sex_places = pd.read_csv('../data/2017-04-21-sex-place-distances.xz',
                         dtype=DTYPE)
sex_places.shape
def update_companies(self):
    """Download the latest companies dataset.

    Logs the update, makes sure the target directory exists, then
    fetches ``self.COMPANIES_DATASET`` into ``self.path``.
    """
    self.log.info('Updating companies')

    target = self.path
    os.makedirs(target, exist_ok=True)
    fetch(self.COMPANIES_DATASET, target)
# Note: remember to correct prices with an inflation index (e.g. IPCA). # In[1]: get_ipython().magic('matplotlib inline') import matplotlib.pyplot as plt import seaborn as sns sns.set(color_codes=True) plt.rcParams['figure.figsize'] = (20, 10) # In[2]: from serenata_toolbox.datasets import fetch fetch('2016-11-19-reimbursements.xz', '../data') fetch('2016-09-03-companies.xz', '../data') fetch('2016-11-29-yelp-companies.xz', '../data') fetch('2016-12-02-foursquare-companies.xz', '../data') # In[3]: import numpy as np import pandas as pd dataset = pd.read_csv('../data/2016-11-19-reimbursements.xz', dtype={ 'applicant_id': np.str, 'cnpj_cpf': np.str, 'congressperson_id': np.str, 'subquota_number': np.str
# coding: utf-8

# # Invalid CNPJ or CPF from Federal Senate CEAP
#
# `cnpj_cpf` is the column identifying the company or individual who received the payment made by the congressperson. Having this value empty should mean that it's an expense made outside Brazil, with a company (or person) without a Brazilian ID.

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

fetch('2017-05-22-federal-senate-reimbursements.xz', '../data/')

# In[2]:

# cnpj_cpf is converted to string so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
dataset = pd.read_csv('../data/2017-05-22-federal-senate-reimbursements.xz',
                      converters={'cnpj_cpf': str},
                      encoding='utf-8')

# In[3]:

# Keep only rows that actually carry a Brazilian ID.
dataset = dataset[dataset['cnpj_cpf'].notnull()]
dataset.head()

# In[4]:

from pycpfcnpj import cpfcnpj
# coding: utf-8

# # Expenses in closed companies
#
# Recently we found out that there are many companies that are already closed or out of service; we aim to find out whether there are expenses made after the company situation changed to anything other than open.

# In[1]:

import pandas as pd
import numpy as np
from serenata_toolbox.datasets import fetch

fetch('2016-09-03-companies.xz', '../data')
fetch('2016-11-19-reimbursements.xz', '../data')

# In[2]:

companies = pd.read_csv('../data/2016-09-03-companies.xz', low_memory=False)
# IDs are read as strings so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
reimbursements = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                             dtype={'applicant_id': str,
                                    'cnpj_cpf': str,
                                    'congressperson_id': str,
                                    'subquota_number': str},
                             low_memory=False)

# ## Formatting
#
# Formatting companies situation_date and reimbursements issue_date columns to the correct date format (needed for a query later), and formatting the companies cnpj to a format without dashes and dots.

# In[3]:
def fetch(self):
    """Download the companies, congresspeople and social-accounts
    datasets into ``self.data_path``."""
    files = (self.COMPANIES_FILE,
             self.CONGRESSPEOPLE_FILE,
             self.SOCIAL_ACCOUNTS_FILE)
    for name in files:
        datasets.fetch(name, self.data_path)