def update_datasets(self):
    """Refresh the Chamber of Deputies data and the companies dataset.

    Runs the full pipeline (fetch → translate → clean) on a ``Dataset``
    rooted at ``self.path``, then downloads the companies dataset into
    the same directory.
    """
    os.makedirs(self.path, exist_ok=True)

    dataset = Dataset(self.path)
    dataset.fetch()
    dataset.translate()
    dataset.clean()

    fetch(self.COMPANIES_DATASET, self.path)
def update_datasets(self):
    """Refresh the CEAP data and the companies dataset.

    Runs the CEAP pipeline steps in order (fetch → convert to CSV →
    translate → clean) against ``self.path``, then downloads the
    companies dataset into the same directory.
    """
    os.makedirs(self.path, exist_ok=True)

    dataset = CEAPDataset(self.path)
    for step in (dataset.fetch, dataset.convert_to_csv,
                 dataset.translate, dataset.clean):
        step()

    fetch(self.COMPANIES_DATASET, self.path)
def test_fetch(self, datasets):
    """fetch() must build a Datasets for the target directory and hand
    the file name over to its downloader exactly once."""
    fetch('file.xz', 'test')

    datasets.assert_called_once_with('test')
    download = datasets.return_value.downloader.download
    download.assert_called_once_with('file.xz')
# coding: utf-8

# # One month after the first "mutirão" (task force)
#
# https://datasciencebr.com/um-m%C3%AAs-depois-do-primeiro-mutir%C3%A3o-369975af4bb5

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

# Download the reimbursements dataset and load it. IDs are read as
# strings so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
fetch('2016-12-06-reimbursements.xz', '../data')
reimbursements = pd.read_csv('../data/2016-12-06-reimbursements.xz',
                             dtype={'document_number': str,
                                    'year': str},
                             low_memory=False)

# In[2]:

import os.path
import urllib.request
import zipfile

# Grab a snapshot of the Serenata de Amor "inbox" repository; the zip is
# cached in /tmp so re-running the cell does not download it again.
inbox_url = 'https://github.com/datasciencebr/serenata-de-amor-inbox/archive/master.zip'
inbox_filepath = '/tmp/master.zip'
if not os.path.exists(inbox_filepath):
    urllib.request.urlretrieve(inbox_url, inbox_filepath)
def test_fetch(self, datasets):
    """Verify fetch() wires Datasets('test') to a download of 'file.xz'."""
    fetch('file.xz', 'test')

    datasets.assert_called_once_with('test')
    mocked_datasets = datasets.return_value
    mocked_datasets.downloader.download.assert_called_once_with('file.xz')
# (continuation: final entry of a state-abbreviation → state-name map
# whose opening lines are above this excerpt)
'DF': 'distrito_federal'}

# Template URL for the IBGE 2010 census population totals; one zip file
# per state, filled in via .format() with the state name.
census_link = "ftp.ibge.gov.br/Censos/Censo_Demografico_2010/resultados/total_populacao_{}.zip"

# ## Gathering cities with @cuducos' Brazilian Cities script
#
# @cuducos had already written a script listing all Brazilian cities with
# their associated code and state, in [this repository](https://github.com/cuducos/brazilian-cities).
#
# We checked, and it is the most reliable way to get the cities.

# In[10]:

from serenata_toolbox.datasets import fetch

fetch('2017-05-22-brazilian-cities.csv', '../data')

# In[11]:

# Load the downloaded cities table and peek at the first rows.
brazilian_cities = pd.read_csv('../data/2017-05-22-brazilian-cities.csv')
brazilian_cities.head()

# In[12]:

brazilian_cities.shape

# ## Normalizing its form
#
# * Where expenses with a total net value equal or higher than 100 BRL
# * In which congresspeople from the 2015 term have spent public money
#
# The set of cities was taken from a [random sample that sounded promising](https://twitter.com/cuducos/status/840882495868530688)… but hold your horses: further analysis is disappointing… let's get started.

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

# CNPJ/CPF identifiers are read as strings so leading zeros survive.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
DTYPE = dict(cnpj=str, cnpj_cpf=str)

# In[2]:

fetch('2017-04-21-sex-place-distances.xz', '../data')

# In[3]:

companies = pd.read_csv('../data/2016-09-03-companies.xz',
                        dtype=DTYPE, low_memory=False)
# Strip every non-digit from the CNPJ. regex=True is now explicit:
# pandas 2.0 changed Series.str.replace's default to literal matching,
# which would leave the r'\D' pattern unmatched.
companies.cnpj = companies.cnpj.str.replace(r'\D', '', regex=True)
companies.shape

# In[4]:

sex_places = pd.read_csv('../data/2017-04-21-sex-place-distances.xz',
                         dtype=DTYPE)
sex_places.shape
def update_companies(self):
    """Download the latest companies dataset.

    Logs the update, makes sure the target directory exists, then
    fetches ``self.COMPANIES_DATASET`` into ``self.path``.
    """
    self.log.info('Updating companies')

    target = self.path
    os.makedirs(target, exist_ok=True)
    fetch(self.COMPANIES_DATASET, target)
# Note: remember to correct prices with an inflation index (e.g. IPCA). # In[1]: get_ipython().magic('matplotlib inline') import matplotlib.pyplot as plt import seaborn as sns sns.set(color_codes=True) plt.rcParams['figure.figsize'] = (20, 10) # In[2]: from serenata_toolbox.datasets import fetch fetch('2016-11-19-reimbursements.xz', '../data') fetch('2016-09-03-companies.xz', '../data') fetch('2016-11-29-yelp-companies.xz', '../data') fetch('2016-12-02-foursquare-companies.xz', '../data') # In[3]: import numpy as np import pandas as pd dataset = pd.read_csv('../data/2016-11-19-reimbursements.xz', dtype={ 'applicant_id': np.str, 'cnpj_cpf': np.str, 'congressperson_id': np.str, 'subquota_number': np.str
# coding: utf-8

# # Invalid CNPJ or CPF from Federal Senate CEAP
#
# `cnpj_cpf` is the column identifying the company or individual who received the payment made by the congressperson. Having this value empty should mean that it's an expense made outside Brazil, with a company (or person) without a Brazilian ID.

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

fetch('2017-05-22-federal-senate-reimbursements.xz', '../data/')

# In[2]:

# cnpj_cpf is converted to string so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
dataset = pd.read_csv('../data/2017-05-22-federal-senate-reimbursements.xz',
                      converters={'cnpj_cpf': str},
                      encoding='utf-8')

# In[3]:

# Keep only rows that actually carry a Brazilian ID.
dataset = dataset[dataset['cnpj_cpf'].notnull()]
dataset.head()

# In[4]:

from pycpfcnpj import cpfcnpj
# coding: utf-8

# # Expenses in closed companies
#
# Recently we found out that there are many companies that are already closed or out of service; we aim to find out whether there are expenses made after the company situation changed to anything other than open.

# In[1]:

import pandas as pd
import numpy as np
from serenata_toolbox.datasets import fetch

fetch('2016-09-03-companies.xz', '../data')
fetch('2016-11-19-reimbursements.xz', '../data')

# In[2]:

companies = pd.read_csv('../data/2016-09-03-companies.xz', low_memory=False)
# IDs are read as strings so leading zeros are preserved.
# Fix: `np.str` was a deprecated alias for the builtin `str` and was
# removed in NumPy 1.24 — use `str` directly.
reimbursements = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                             dtype={'applicant_id': str,
                                    'cnpj_cpf': str,
                                    'congressperson_id': str,
                                    'subquota_number': str},
                             low_memory=False)

# ## Formatting
#
# Formatting companies situation_date and reimbursements issue_date columns to the correct date format (needed for a query later), and formatting the companies cnpj to a format without dashes and dots.

# In[3]:
def fetch(self):
    """Download the companies, congresspeople and social-accounts
    datasets into ``self.data_path``."""
    files = (self.COMPANIES_FILE,
             self.CONGRESSPEOPLE_FILE,
             self.SOCIAL_ACCOUNTS_FILE)
    for name in files:
        datasets.fetch(name, self.data_path)