MODELS_DIR = os.path.join(BASE_DIR, 'models') OUT_BASE_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) OUT_DATA_DIR = os.path.join(OUT_BASE_DIR, 'upload_olist-master', 'data') DB_PATH = os.path.join(OUT_DATA_DIR, 'olist.db') shutil.copyfile(os.path.join(TRAIN_DIR, 'etl.sql'), os.path.join(PREDICT_DIR, 'etl.sql')) query = utils.import_query(os.path.join(PREDICT_DIR, 'etl.sql')) print('\nImportando modelo...') model = pd.read_pickle(os.path.join(MODELS_DIR, 'models.pkl')) print('Pronto.') print('\nAbrindo conexão...') conn = utils.connect_db('sqlite', path=DB_PATH) print('Pronto.') print('\nFazendo ETL...') query = query.format(date=args.date, stage='PREDICT') utils.execute_many_sql(query, conn) df = pd.read_sql_table('PRE_ABT_PREDICT_CHURN', conn) print('Pronto.') print('\nRealizando predições...') df['churn_prob'] = model['models'].predict_proba(df[model['features']])[:, 1] print('Pronto.') print('\nSalvando base escora...') table = df[['churn_prob', 'seller_id']] if args.export == 'sqlite':
import os

import pandas as pd
import sqlalchemy

from olistlib.db import utils

# Project folders: this file lives one level below BASE_DIR
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Every CSV file in data/ becomes one table in the database
files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')]

# MySQL credentials are read from the project's .env file
connection = utils.connect_db('mysql', os.path.join(BASE_DIR, '.env'))

for i in files_names:
    print(i)
    df_tmp = pd.read_csv(os.path.join(DATA_DIR, i))
    # FIX: the original used i.strip(".csv"), which strips *characters*
    # from the set {., c, s, v} at both ends — e.g. 'sellers.csv' would
    # become 'eller'. Drop the extension explicitly instead.
    table_name = "tb_" + os.path.splitext(i)[0].replace("olist_", "").replace(
        "_dataset", "")
    # NOTE(review): schema is 'Olist' here but 'olist' in sibling scripts —
    # confirm which casing the database actually uses.
    df_tmp.to_sql(table_name, connection, schema='Olist',
                  if_exists='replace', index=False)
parser.add_argument("--database", help='Nome do bando de dados', choices=['sqlite', 'sql'], default='sqlite') args = parser.parse_args() # Os endereços de nosso projeto e sub pastas BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_DIR = os.path.join(BASE_DIR, 'data') # Encontrando os arquivos de dados files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')] # Abrindo conexão com banco... connection = utils.connect_db(args.database, os.path.join(BASE_DIR, '.env'), path=os.path.join(DATA_DIR, 'olist.db')) # Para cada arquivo é realizado uma inserção no banco for i in files_names: print(i) df_tmp = pd.read_csv(os.path.join(DATA_DIR, i)) table_name = "tb_" + i.strip(".csv").replace("olist_", "").replace( "_dataset", "") if args.database == 'sqlite': df_tmp.to_sql(table_name, connection, if_exists='replace', index=False) elif args.database == 'sql': df_tmp.to_sql(table_name, connection, schema='olist',
args = parser.parse_args()

# Project folders (train dir -> data_prep -> src -> project base)
TRAIN_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PREP_DIR = os.path.dirname(TRAIN_DIR)
SRC_DIR = os.path.dirname(DATA_PREP_DIR)
BASE_DIR = os.path.dirname(SRC_DIR)
DATA_DIR = os.path.join(BASE_DIR, 'data')
DB_PATH = os.path.join(os.path.dirname(BASE_DIR), 'upload_olist', 'data',
                       'olist.db')

# Extraction window starts `delta` months before the end date
date_init = (datetime.datetime.strptime(args.date_end, "%Y-%m-%d")
             - dateutils.relativedelta(months=args.delta))
date_init = date_init.strftime("%Y-%m-%d")

# Open the database connection
con = utils.connect_db('sqlite', path=DB_PATH)

# Maximum number of days available for a resale within the window
days_between = (datetime.datetime.strptime(args.date_end, "%Y-%m-%d")
                - datetime.datetime.strptime(date_init, "%Y-%m-%d"))
days_between = days_between.days

print("\n\n Criando tabela axuliar...")

# Every product category present in the data
categories_query = utils.import_query(os.path.join(TRAIN_DIR,
                                                   'categories.sql'))
categories = pd.read_sql_query(categories_query, con)['categories'].tolist()

# Auxiliary day-by-day frame: one row per (day, category) pair.
# NOTE(review): the loop body is truncated in this view — df_tmp is
# presumably accumulated into df_days further on.
df_days = pd.DataFrame()
for c in categories:
    df_tmp = pd.DataFrame({"days": list(range(1, days_between + 1)),
                           'category': [c] * days_between})
date_end = args.date_end ano = int(date_end.split('-')[0]) - 1 mes = int(date_end.split('-')[1]) date_init = f'{ano}-{mes}-01' # Importa a Query query = utils.import_query(os.path.join(DATA_PREP_DIR, 'sgmt.sql')) query = query.format(date_init = date_init, date_end = date_end) #Abrindo conexão com o banco usando lib SQLite3 conn = utils.connect_db() # #Abrindo conexão com o banco usando lib SqlAlchemy conn_alchemy = utils.connect_db_alchemy() create_query = f''' CREATE TABLE tb_seller_sgmt AS {query} ;''' insert_query = f''' DELETE FROM tb_seller_sgmt WHERE DT_SGMT = '{date_end}'; INSERT INTO tb_seller_sgmt {query};''' try:
# Monthly reference dates used to build the training ABT
datas = [
    '2017-01-01',
    '2017-02-01',
    '2017-03-01',
    '2017-04-01',
    '2017-05-01',
    '2017-06-01',
    '2017-07-01',
    '2017-08-01',
    '2017-09-01',
]

# NOTE(review): hard-coded machine-specific path — consider deriving it
# from the project layout as sibling scripts do.
conn = utils.connect_db(
    'sqlite',
    path='/Users/igorfreitascruz/Documents/Ciência de dados/olist_project/upload_olist-master/data/olist.db'
)

# One query execution per reference date, stacked vertically
dfs = [pd.read_sql_query(query.format(date=data), conn) for data in datas]
abt = pd.concat(dfs, axis=0, ignore_index=True)

# Overall churn rate (evaluated for inspection; the value is not stored)
abt['flag_churn'].mean()

# Target and feature definitions
target = 'flag_churn'
features = abt.columns[3:-2]
choices=['sqlite', 'sql'], default='sqlite') args = parser.parse_args() date_end = args.date_end date_init = datetime.datetime.strptime( args.date_end, "%Y-%m-%d") - dateutils.relativedelta(years=1) date_init = date_init.strftime("%Y-%m-%d") # Importa a query query = utils.import_query(os.path.join(DATA_PREP_DIR, 'segmentos.sql')) query = query.format(date_init=date_init, date_end=date_end) if args.database == 'sqlite': query = query.replace('olist.', '') table_name = 'tb_seller_sgmt' elif args.database == 'sql': table_name = 'olist.tb_seller_sgmt' # Abrindo conexão com banco... conn = utils.connect_db(args.database, path=DB_PATH) try: create_query = f'''CREATE TABLE {table_name} AS\n{query};''' utils.execute_many_sql(create_query, conn) except: insert_query = f'''DELETE FROM {table_name} WHERE DT_SGMT = '{date_end}'; INSERT INTO {table_name} {query};''' utils.execute_many_sql(insert_query, conn, verbose=True)
import os

import pandas as pd
import sqlalchemy

from olistlib.db import utils

# Project folders: this file lives one level below BASE_DIR
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Every CSV file in data/ becomes one table in the database
files_names = [i for i in os.listdir(DATA_DIR) if i.endswith('.csv')]

# MariaDB credentials are read from the project's .env file
connection = utils.connect_db('mariadb', os.path.join(BASE_DIR, '.env'))

# One insert per CSV file
for i in files_names:
    df_tmp = pd.read_csv(os.path.join(DATA_DIR, i))
    # FIX: the original used i.strip(".csv"), which strips *characters*
    # from the set {., c, s, v} at both ends — e.g. 'sellers.csv' would
    # become 'eller'. Drop the extension explicitly instead.
    table_name = "tb_" + os.path.splitext(i)[0].replace("olist_", "").replace(
        "_dataset", "")
    df_tmp.to_sql(table_name, connection, schema='olist',
                  if_exists='replace', index=False)