Exemple #1
0
MODELS_DIR = os.path.join(BASE_DIR, 'models')

OUT_BASE_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
OUT_DATA_DIR = os.path.join(OUT_BASE_DIR, 'upload_olist-master', 'data')
DB_PATH = os.path.join(OUT_DATA_DIR, 'olist.db')

shutil.copyfile(os.path.join(TRAIN_DIR, 'etl.sql'),
                os.path.join(PREDICT_DIR, 'etl.sql'))
query = utils.import_query(os.path.join(PREDICT_DIR, 'etl.sql'))

print('\nImportando modelo...')
model = pd.read_pickle(os.path.join(MODELS_DIR, 'models.pkl'))
print('Pronto.')

print('\nAbrindo conexão...')
conn = utils.connect_db('sqlite', path=DB_PATH)
print('Pronto.')

print('\nFazendo ETL...')
query = query.format(date=args.date, stage='PREDICT')
utils.execute_many_sql(query, conn)
df = pd.read_sql_table('PRE_ABT_PREDICT_CHURN', conn)
print('Pronto.')

print('\nRealizando predições...')
df['churn_prob'] = model['models'].predict_proba(df[model['features']])[:, 1]
print('Pronto.')

print('\nSalvando base escora...')
table = df[['churn_prob', 'seller_id']]
if args.export == 'sqlite':
Exemple #2
0
import os
import pandas as pd
import sqlalchemy
from olistlib.db import utils

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')

files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')]

connection = utils.connect_db('mysql', os.path.join(BASE_DIR, '.env'))

for i in files_names:
    print(i)
    df_tmp = pd.read_csv(os.path.join(DATA_DIR, i))
    table_name = "tb_" + i.strip(".csv").replace("olist_", "").replace(
        "_dataset", "")
    df_tmp.to_sql(table_name,
                  connection,
                  schema='Olist',
                  if_exists='replace',
                  index=False)
Exemple #3
0
parser.add_argument("--database",
                    help='Nome do bando de dados',
                    choices=['sqlite', 'sql'],
                    default='sqlite')
args = parser.parse_args()

# Os endereços de nosso projeto e sub pastas
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Encontrando os arquivos de dados
files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')]

# Abrindo conexão com banco...
connection = utils.connect_db(args.database,
                              os.path.join(BASE_DIR, '.env'),
                              path=os.path.join(DATA_DIR, 'olist.db'))

# Para cada arquivo é realizado uma inserção no banco
for i in files_names:
    print(i)
    df_tmp = pd.read_csv(os.path.join(DATA_DIR, i))
    table_name = "tb_" + i.strip(".csv").replace("olist_", "").replace(
        "_dataset", "")
    if args.database == 'sqlite':
        df_tmp.to_sql(table_name, connection, if_exists='replace', index=False)

    elif args.database == 'sql':
        df_tmp.to_sql(table_name,
                      connection,
                      schema='olist',
Exemple #4
0
args = parser.parse_args()

# Folders do projeto
TRAIN_DIR = os.path.dirname( os.path.abspath(__file__) )
DATA_PREP_DIR = os.path.dirname( TRAIN_DIR )
SRC_DIR = os.path.dirname( DATA_PREP_DIR )
BASE_DIR = os.path.dirname( SRC_DIR )
DATA_DIR = os.path.join(BASE_DIR, 'data')
DB_PATH = os.path.join( os.path.dirname( BASE_DIR ), 'upload_olist', 'data', 'olist.db' )

# Buscando data de incio da extração
date_init = datetime.datetime.strptime(args.date_end, "%Y-%m-%d") - dateutils.relativedelta(months=args.delta)
date_init = date_init.strftime( "%Y-%m-%d" )

# Abrindo conexão com o banco
con  = utils.connect_db('sqlite', path=DB_PATH) # Abre a conexao com o banco....

# Calculo da quantidade maxima de dias para revenda
days_between = datetime.datetime.strptime( args.date_end, "%Y-%m-%d" ) - datetime.datetime.strptime( date_init, "%Y-%m-%d" )
days_between = days_between.days

print("\n\n Criando tabela axuliar...")
# identificação de todas possíveis categorias
categories_query = utils.import_query( os.path.join(TRAIN_DIR, 'categories.sql') )
categories = pd.read_sql_query( categories_query, con )['categories'].tolist()

# criação de tabela com info de dia a dia por catetoria
df_days = pd.DataFrame()
for c in categories:
    df_tmp = pd.DataFrame( {"days": list(range(1, days_between+1)),
                            'category': [c] * days_between  } )
Exemple #5
0
date_end = args.date_end

ano = int(date_end.split('-')[0]) - 1
mes = int(date_end.split('-')[1])

date_init = f'{ano}-{mes}-01'

# Importa a Query
query = utils.import_query(os.path.join(DATA_PREP_DIR, 'sgmt.sql'))

query = query.format(date_init = date_init,
                     date_end = date_end)

#Abrindo conexão com o banco usando lib SQLite3
conn = utils.connect_db()

# #Abrindo conexão com o banco usando lib SqlAlchemy
conn_alchemy = utils.connect_db_alchemy()

create_query = f'''
CREATE TABLE tb_seller_sgmt AS 
{query}
;'''

insert_query = f'''
DELETE FROM tb_seller_sgmt WHERE DT_SGMT = '{date_end}';
INSERT INTO tb_seller_sgmt 
{query};'''

try:
datas = [
    '2017-01-01',
    '2017-02-01',
    '2017-03-01',
    '2017-04-01',
    '2017-05-01',
    '2017-06-01',
    '2017-07-01',
    '2017-08-01',
    '2017-09-01',
]

conn = utils.connect_db(
    'sqlite',
    path=
    '/Users/igorfreitascruz/Documents/Ciência de dados/olist_project/upload_olist-master/data/olist.db'
)
dfs = []
for data in datas:
    query_formatada = query.format(date=data)
    df_tmp = pd.read_sql_query(query_formatada, conn)
    dfs.append(df_tmp)

abt = pd.concat(dfs, axis=0, ignore_index=True)
abt['flag_churn'].mean()

# Definição das variáveis
target = 'flag_churn'
features = abt.columns[3:-2]
Exemple #7
0
                    choices=['sqlite', 'sql'],
                    default='sqlite')
args = parser.parse_args()

date_end = args.date_end
date_init = datetime.datetime.strptime(
    args.date_end, "%Y-%m-%d") - dateutils.relativedelta(years=1)
date_init = date_init.strftime("%Y-%m-%d")

# Importa a query
query = utils.import_query(os.path.join(DATA_PREP_DIR, 'segmentos.sql'))
query = query.format(date_init=date_init, date_end=date_end)

if args.database == 'sqlite':
    query = query.replace('olist.', '')
    table_name = 'tb_seller_sgmt'

elif args.database == 'sql':
    table_name = 'olist.tb_seller_sgmt'

# Abrindo conexão com banco...
conn = utils.connect_db(args.database, path=DB_PATH)

try:
    create_query = f'''CREATE TABLE {table_name} AS\n{query};'''
    utils.execute_many_sql(create_query, conn)

except:
    insert_query = f'''DELETE FROM {table_name} WHERE DT_SGMT = '{date_end}';
    INSERT INTO {table_name} {query};'''
    utils.execute_many_sql(insert_query, conn, verbose=True)
Exemple #8
0
import os
import pandas as pd
import sqlalchemy

from olistlib.db import utils

# enderecos e sub pastas do projeto
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# list comprehension
files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')]

# abrindo conexao com banco
connection = utils.connect_db('mariadb', os.path.join(BASE_DIR, '.env'))

# Para cada arquivo e realizado uma insercao no banco
for i in files_names:
    df_tmp = pd.read_csv(os.path.join(DATA_DIR, i))
    table_name = "tb_" + i.strip(".csv").replace("olist_", "").replace(
        "_dataset", "")
    df_tmp.to_sql(table_name,
                  connection,
                  schema='olist',
                  if_exists='replace',
                  index=False)