def copy_production_schema_to_sqlite(production_path):
    """
    Copy every table of the `production` database schema into a SQLite file.

    Each reflected table is read completely into memory and written to the
    SQLite database, replacing a table of the same name if it already exists.

    Parameters
    ----------
    production_path: string
        Prefix of the SQLite file. 'features.db' is appended to it as-is, so
        a folder must be passed with its trailing separator.
    """
    # One engine is enough for both the reflection and all the table reads.
    engine = connect_to_database()
    con_sqlite = sqlite3.connect(production_path + 'features.db')
    try:
        with engine.connect() as con:
            metadata = sqlalchemy.MetaData(bind=con, schema='production')
            metadata.reflect()

            # Reflected keys have the form '<schema>.<table>'
            for qualified_name in metadata.tables:
                schema, table = qualified_name.split('.')
                content = pd.read_sql_table(table, schema=schema, con=engine)
                content.to_sql(table,
                               con=con_sqlite,
                               if_exists='replace',
                               index=False)
    finally:
        # Release the SQLite file handle even if a table fails to copy
        con_sqlite.close()
def load_documents_to_database(file_path, method, table, schema, how='append'):
    """
    Load csv file with text characteristics into DB.

    The csv is expected to be the output of text extraction, with the columns:
    filename, is_extractable, number_of_pages

    A `method` column is added to the data, which is then passed through
    `join_filename_with_id` and uploaded with `load_pandas_to_db`.

    Parameters
    ----------
    file_path: string
        Complete path to csv file to load.
    method: string
        Method used to extract the text: 'pdfminer', 'pypdf', 'tika'.
    table: string
        Name of the target table to upload data.
    schema: string
        Name of the schema where the target table is.
    how: string
        In case the table already exists, what should happen: 'fail',
        'replace', 'append' (default).
    """
    # No connection is opened here: the upload helper receives only the
    # dataframe and the target table, so an engine created in this function
    # would never be used.
    documents_df = pd.read_csv(file_path, delimiter=',')
    documents_df['method'] = method

    load_df = join_filename_with_id(df=documents_df)

    load_pandas_to_db(df=load_df, table=table, schema=schema, how=how)
def create_labels(args):
    """
    Build the labels of the experiment cohort and persist them locally.

    The label query is assembled from the cohort and label queries of the
    experiment configuration. Only tenders that are also present in the
    locally stored features are kept.

    Parameters
    ----------
    args: dict
        Run arguments. Must contain 'experiment_id'; it is also handed to
        `get_local` and `persist_local`.

    Return
    ---------
    None
        The dataframe of IDs and labels is stored through `persist_local`
        under the 'labels' folder.
    """
    experiment = get_experiment(args['experiment_id'])
    feature_ids = get_local(args, 'features')['id_llamado']

    cohort_query = experiment['cohort_config']['query']
    target_expression = experiment['label_config']['query']

    label_query = """
        select distinct labels.id_llamado as id_llamado,
        tipo_procedimiento_codigo,
        labels.reception_date,
        {label_target} as target
        from semantic.labels labels
        join semantic.tenders tenders
        on labels.id_llamado = tenders.id_llamado
        where labels.id_llamado in ({cohort})
        """.format(cohort=cohort_query, label_target=target_expression)

    all_labels = pd.read_sql_query(label_query, utils.connect_to_database())

    # Keep only the tenders for which features exist
    has_features = all_labels['id_llamado'].isin(feature_ids)
    persist_local(all_labels[has_features], args, 'labels')
def get_data_from_db(query, as_pandas=False):
    """
    Gets query result from database

    Parameters
    ----------
    query: string
        SQL query
    as_pandas: bool
        Either if you want the result as a pd.DataFrame

    Return
    ------
    sqlalchemy.engine.result.ResultProxy or pd.DataFrame
        Content of the query.
        NOTE(review): when `as_pandas` is False the result object is returned
        after the connection context has exited; whether its rows can still
        be fetched depends on the driver -- prefer `as_pandas=True` when the
        rows are needed.
    """
    with utils.connect_to_database().connect() as con:
        result = con.execute(query)

        if as_pandas:
            # Consume the rows while the connection is still open
            return pd.DataFrame(result, columns=result.keys())

    return result
def create_production_schema_postgresql():
    """
    Execute the statements of `create-production-schema.sql` in the database.

    The script is looked up in `sql/production/`, three levels above the
    absolute path of 'joaoc-experiment-checks' (resolved against the current
    working directory). Its content is split on ';' and every statement is
    executed in order on a single connection.
    """
    to_production_sql = Path(
        os.path.abspath('joaoc-experiment-checks')
    ).parent.parent.parent / 'sql' / 'production' / 'create-production-schema.sql'

    # read_text() opens and closes the file itself, so no handle is leaked.
    # The last chunk (whatever follows the final ';') is not a statement.
    statements = to_production_sql.read_text().split(';')[:-1]

    with connect_to_database().connect() as con:
        for statement in statements:
            con.execute(statement)
def create_features(args):
    """
    Function to obtain features specified in the experiment file.
    Function will loop over all the features, query each feature table for
    the cohort, inner-join the results on `id_llamado`, drop the rows with
    missing values and persist the outcome locally.

    Parameters:
    ------------
    args: dict
        Run arguments. Must contain 'experiment_id'; it is also handed to
        `persist_local`.

    Return:
    ------------
    None
        The dataframe of features is stored through `persist_local` under
        the 'features' folder.
    """
    experiment = get_experiment(args['experiment_id'])

    query_config = """with cd_tenders as (
            {cohort}
        )
        select cd_tenders.id_llamado, {columns}
        from cd_tenders
        left join {table} as feature_table
        on cd_tenders.id_llamado = feature_table.id_llamado
        """

    con = utils.connect_to_database()
    cohort_query = experiment['cohort_config']['query']

    # None (and not an empty dataframe) marks "nothing merged yet": a merge
    # that legitimately yields zero rows must stay empty instead of being
    # overwritten by the next feature table.
    features_combined = None

    for feature_config in experiment['features']:
        query = query_config.format(cohort=cohort_query,
                                    columns=','.join(
                                        feature_config['columns']),
                                    table=feature_config['table'])

        features = pd.read_sql_query(query, con)

        if features_combined is None:
            features_combined = features
        else:
            features_combined = features_combined.merge(features,
                                                        on='id_llamado',
                                                        how='inner')

    if features_combined is None:
        # Experiment without feature groups: persist an empty dataframe
        features_combined = pd.DataFrame()

    features_combined = features_combined.dropna()

    persist_local(features_combined, args, 'features')
def fetch_data(experiment_id):
    """
    Fetch the evaluation scores of an experiment with their approach names.

    Parameters
    ----------
    experiment_id: int or string
        Experiment identifier; it is interpolated into the SQL query as-is.

    Return
    ------
    pd.DataFrame
        One row per evaluation with the columns learner_id, fold,
        eval_metric, score, experiment_id, approach_id and name.
    """
    output_columns = [
        'learner_id', 'fold', 'eval_metric', 'score', 'experiment_id',
        'approach_id', 'name'
    ]

    engine = utils.connect_to_database()

    evaluations_query = f"""
    select a.*, b.name
    from experiments.evaluations a
    inner join experiments.approaches b
    on a.experiment_id = b.experiment_id
    and a.approach_id = b.approach_id
    where a.experiment_id = {experiment_id}
    """

    evaluations = pd.read_sql_query(evaluations_query, engine)
    return evaluations[output_columns]
def get_file_dict(id_llamado):
    """
    Obtain a dictionary of full filepaths given the input IDs

    Only documents flagged as extractable and processed with the 'tika'
    method are considered.

    Parameters
    ----------
    id_llamado : list
        list of IDs to obtain the full filepaths

    Returns
    -------
    file_path_dict : dictionary
        Dictionary with two parallel lists: 'id_llamado' with the IDs and
        'fullpath' with the corresponding full path of each document.
    """
    con = connect_to_database()

    query = """
    select id_llamado, filename, method
    from semantic.documents
    where is_extractable = true
    and method = 'tika'
    """

    # Load full list of all id_llamados and their corresponding filenames
    full_list = pd.read_sql_query(query, con)

    # Get the subset based on id_llamados. Work on a copy so the column
    # assignments below do not target a slice of `full_list`.
    subset_list = full_list[full_list['id_llamado'].isin(id_llamado)].copy()

    # Change the filename to .txt instead of .pdf or .PDF
    # (the whole name is lower-cased and every '.pdf' occurrence is removed)
    if subset_list.empty:
        print('Empty dataframe')
    else:
        lowered = subset_list['filename'].str.lower()
        subset_list['filename'] = lowered.str.replace(
            '.pdf', '', regex=False) + '.txt'

    # Generate the full path
    subset_list['fullpath'] = document_path + \
        subset_list['method'] + '/' + subset_list['filename']

    # Extract full path and id_llamado
    return {
        'id_llamado': subset_list['id_llamado'].tolist(),
        'fullpath': subset_list['fullpath'].tolist(),
    }
def insert_to_db(data, schema, table, how='append'):
    """
    Inserts a dictionary (or a list of dictionaries) as rows in the
    corresponding table in the database.

    Rows that violate an integrity constraint are silently skipped: the
    IntegrityError raised by the insert is swallowed on purpose.

    Parameters
    ----------
    data: dictionary or list of dictionaries
        Dict with keys being the columns of the table
    schema: string
        Schema name
    table: string
        Table name
    how: string
        TODO: currently ignored.

    Returns
    -------
    None
    """
    engine = utils.connect_to_database()

    try:
        with engine.connect() as con:
            metadata = sqlalchemy.MetaData(bind=con, schema=schema)
            metadata.reflect()

            # A single row is inserted the same way as a batch of rows
            if isinstance(data, dict):
                data = [data]

            try:
                metadata.tables[f'{schema}.{table}'].insert().execute(data)
            except sqlalchemy.exc.IntegrityError:
                # Best effort: constraint violations are ignored
                pass
    finally:
        # Always release the connection pool, also when reflection or the
        # insert fails with an unexpected error
        engine.dispose()
def prepare_to_selection(selected_learners, configuration):
    """
    Attach labels, bias data and aequitas results to the selected learners.

    Parameters
    ----------
    selected_learners:
        Learners to prepare; passed unchanged to `get_data`.
    configuration: dict
        Must contain 'query' (used to download the bias data) and 'groups'
        (used to filter it).

    Returns
    -------
    tuple
        The output of `melt_aequitas` over all the learners, and the list of
        learner entries enriched with the keys 'labels', 'bias' and
        'aequitas'.
    """
    engine = connect_to_database()

    learners_data = get_data(selected_learners)

    group_bias = filter_bias_data(
        download_bias_data(query=configuration['query'], con=engine),
        configuration['groups'])

    for learner in learners_data:
        learner['labels'] = label_data(learner['results'])

        learner['bias'] = add_bias(learner['labels'], group_bias)
        learner['bias']['model_id'] = learner['learner_id']

        learner['aequitas'] = get_aequitas(learner['bias'])

    return melt_aequitas(learners_data), learners_data
def fetch_data(selector_config):
    """
    Fetch the evaluations of the experiment referenced by the selector
    configuration, together with the name of each approach.

    Parameters
    ----------
    selector_config: dict
        Must contain 'experiment_id'; the value is interpolated into the SQL
        query as-is.

    Return
    ------
    pd.DataFrame
        One row per evaluation with the columns learner_id, fold, name,
        eval_metric, score, experiment_id and approach_id.
    """
    output_columns = [
        'learner_id', 'fold', 'name', 'eval_metric', 'score', 'experiment_id',
        'approach_id'
    ]

    engine = connect_to_database()

    evaluations_query = f"""
    select evaluation.*, approach.name
    from (select *
        from experiments.evaluations
        where experiment_id = {selector_config['experiment_id']}) evaluation
    left join experiments.approaches approach
    on evaluation.experiment_id = approach.experiment_id
    and evaluation.approach_id = approach.approach_id
    """

    evaluations = pd.read_sql_query(evaluations_query, engine)
    return evaluations[output_columns]
def do_plots(experiment_id):
    """
    Plot the score per fold of every learner of an experiment and persist
    one figure per evaluation metric.

    Each figure holds one line per learner. Lines are coloured by approach
    (the same colour in every figure) and the legend shows one entry per
    approach.

    Parameters
    ----------
    experiment_id:
        Identifier of the experiment whose evaluations are plotted. It is
        compared against the `experiment_id` column of the evaluations.

    Return
    ------
    None
        Figures are stored through `persist_local` in the
        'evaluation_plots' folder as '.png'.
    """
    # Get data on experiment results from database
    con = utils.connect_to_database()

    query = """
    select evaluation.*,approach.name
    from experiments.evaluations evaluation
    left join experiments.approaches approach
    on evaluation.approach_id = approach.approach_id
    """

    df = pd.read_sql_query(query, con)

    # Subselect data on specific experiment id
    data = df.loc[df['experiment_id'] == experiment_id]

    # Set font size
    plt.rcParams.update({'font.size': 14})

    if data.empty:
        # Nothing to plot for this experiment
        return

    # One colour per approach, shared by all the figures
    approaches = data['approach_id'].unique()
    palette = cm.rainbow(np.linspace(0, 1, len(approaches)))

    # Check if it is k-fold or temporal-fold
    # (assumes `fold` holds strings, with '-' only in date-like folds)
    temporal_folds = '-' in data['fold'].iloc[0]

    # Loop to create one fig per metric and a line per learner
    for metric in data['eval_metric'].unique():
        metric_data = data[data['eval_metric'] == metric]

        fig, ax1 = plt.subplots(figsize=(15, 8))
        try:
            ax1.set_title(f"Metric: {metric}")
            ax1.set_ylabel('score')

            if temporal_folds:
                ax1.set_xlabel('time')
                plt.xticks(rotation=90)
            else:
                ax1.set_xlabel('fold')

            for approach, colour in zip(approaches, palette):
                approach_data = metric_data[metric_data['approach_id'] ==
                                            approach]
                if approach_data.empty:
                    continue

                # The name comes from a left join, so it can be missing
                names = approach_data['name'].dropna().unique()
                label = str(names[0]) if len(names) else str(approach)

                for learner in approach_data['learner_id'].unique():
                    learner_data = approach_data[approach_data['learner_id']
                                                 == learner]
                    ax1.plot(learner_data['fold'],
                             learner_data['score'],
                             c=colour,
                             label=label)
                    # Only the first line of an approach gets a legend
                    # entry; labels starting with '_' are ignored by
                    # matplotlib legends
                    label = '_nolegend_'

            # Build the legend once, after all the lines are drawn
            if ax1.get_legend_handles_labels()[0]:
                ax1.legend()

            persist_local(data=fig,
                          args={
                              'experiment_id': experiment_id,
                              'eval_metric': metric
                          },
                          folder='evaluation_plots',
                          id_keys=['experiment_id', 'eval_metric'],
                          as_type='.png')
        finally:
            # Figures created through pyplot stay alive until closed
            plt.close(fig)
# +
import seaborn as sns

fontsize = 20  # General fontsize


def get_index_of_line(line, df, forbiden='forest'):
    """
    Locate a legend entry by a fragment of its text.

    The first unique value of `df` that contains `line` and does not contain
    `forbiden` is looked up among the legend texts of the axes that own
    `axis.lines[0]`. `axis` is read from the module scope.

    Parameters
    ----------
    line: string
        Fragment that the wanted value has to contain.
    df:
        Object exposing `.unique()` over strings (presumably a pandas
        Series, despite the name -- verify against callers).
    forbiden: string
        Fragment that the wanted value must not contain.

    Return
    ------
    int
        Position of the matching legend text minus one.

    Raises
    ------
    IndexError
        If no value of `df` matches.
    ValueError
        If the matching value is not among the legend texts.
    """
    matches = [
        value for value in df.unique()
        if line in value and forbiden not in value
    ]
    wanted = matches[0]

    legend_texts = axis.lines[0].axes.get_legend().texts
    legend_labels = [text.get_text() for text in legend_texts]

    return legend_labels.index(wanted) - 1
# -

con = connect_to_database()

# Add here your parameters:
macro_experiment_id = 3481383696
args = {'experiment_id': macro_experiment_id}

# Errors logged by every experiment of the macro experiment, newest first
query = f"""
select *
from experiments.errors
where experiment_id in (
    select experiment_id
    from experiments.parallelization
    where macro_experiment_id = {macro_experiment_id})
order by created_on desc
"""
def _list_pdf_files(folder):
    """Return the set of file names in `folder` ending in '.pdf' (any case)."""
    return {
        name for name in os.listdir(folder) if name.lower().endswith('.pdf')
    }


def list_documents_to_extract(output_file):
    """
    Get the list of documents (complete path) from which to extract text and
    characteristics.

    For every tender, exactly one document is selected: the one found in the
    first PBC folder or, if there is none there, the only one found in the
    other two PBC folders. Tenders with several candidates (or none) are
    reported through `logging.error` and skipped. Documents already present
    in the output file are skipped as well.

    Parameters
    ----------
    output_file: string
        Output file where previous runs are stored.

    Returns
    -------
    docs_to_check: list
        Sorted list of documents, without duplicates.
    """
    checked_files = set(check_or_create_output_file(output_file))

    con = connect_to_database()

    # Retrieve all documents from DB
    query = ("select id_llamado, nombre_archivo as filename"
             " from raw.pbc_adenda"
             " where lower(tipo_documento) != 'adenda'"
             " and right(lower(nombre_archivo), 3) = 'pdf'")

    all_documents = pd.read_sql_query(query, con)

    # List document folders (sets give constant-time membership tests)
    set_first_pbc = _list_pdf_files(first_pbc_path)
    set_all_pbc = _list_pdf_files(all_pbc_path)
    set_any_pbc = set_all_pbc | _list_pdf_files(more_pbc_path)

    docs_to_check = set()

    # Group once instead of filtering the whole table for every tender.
    # sort=False keeps the tenders in order of first appearance; rows
    # without id_llamado do not form a group and are skipped.
    docs_by_tender = all_documents.groupby('id_llamado',
                                           sort=False)['filename']

    for tender_id, filenames in tqdm(docs_by_tender,
                                     total=docs_by_tender.ngroups):
        set_docs = set(filenames)

        # First, check if there are documents in the first PBC folder
        in_first = set_docs & set_first_pbc

        # If there is more than one document in first PBC, there is a problem
        if len(in_first) > 1:
            logging.error(
                f'{tender_id} has {len(in_first)} documents in first PBC.')
            continue

        if len(in_first) == 1:
            # Exactly one document in first PBC: that is the one to use
            (curr_doc,) = in_first
            folder = first_pbc_path
        else:
            in_any = set_docs & set_any_pbc

            # Without a document in first PBC, anything other than exactly
            # one document in the remaining folders is a problem
            if len(in_any) != 1:
                logging.error(
                    f'{tender_id} has {len(in_any)} documents in PBC folders.')
                continue

            (curr_doc,) = in_any
            folder = all_pbc_path if curr_doc in set_all_pbc else more_pbc_path

        if curr_doc not in checked_files:
            docs_to_check.add(f"{folder}/{curr_doc}")

    return sorted(docs_to_check)
def generate_prioritized_risk_list(data_path,
                                   n_top_features=3,
                                   save_csv=True,
                                   output_folder=''):
    """
    Given a csv file path containing tenders minimum information, prioritize
    them according to their risk and flag those that should be reviewed
    thoroughly (flagged as quality_review=1).

    Parameters
    -----------
    data_path: string
        Complete path of the csv file where the tenders information is.
        The minimum information required is:
        - id_llamado
        - convocante
        - tipo_entidad
        - tipo_procedimiento_codigo
        - categoria
        - _objeto_licitacion
        - monto_global_estimado_planificacion
        - _moneda
        - reception_date
    n_top_features: int
        Number of top features wanted.
        By default, the top 3 features will be displayed.
    save_csv: bool
        Save output to csv.
        By default, the output csv will be saved.
    output_folder: string
        Folder where to output the csv file.
        By default, the root path.
        NOTE(review): the file is written to f'{output_folder}/<name>', so
        the default '' resolves to the filesystem root -- confirm this is
        the intended "root path".

    Returns
    -------
    csv_df: DataFrame
        Prioritized dataframe: the input tenders sorted by decreasing risk,
        with `risk_factor_*` columns and the `quality_review` flag.
    """
    # Load arguments of the best model.
    # The pickle is a local production artifact; never point this at a file
    # from an untrusted source.
    with open(f"{production_path}/best_model_args.p", 'rb') as args_file:
        args = pickle.load(args_file)

    # Connect to DB
    con = connect_to_database()

    # Load data
    original_df = pd.read_csv(data_path)

    # Read data and prepare it to start the ML pipeline
    features_df = generate_features_from_new_data(original_df, args, con)

    # Load encoders (model and preprocessor)
    model = load_encoder(args, type='model')
    preprocessor = load_encoder(args, type='preprocessing')

    # Apply preprocessing
    features, _ = preprocessing.run(preprocessors=list(preprocessor),
                                    data=features_df,
                                    preprocessing=preprocessor,
                                    fit=False)

    # Predict risk scores (second column of predict_proba)
    scores = [p[1] for p in model.predict_proba(features)]

    # Get important features for each individual tender
    features_list = get_feature_importance(model, features, n_top_features)

    # Define final output csv file
    csv_df = original_df.copy()
    csv_df['complaint_risk'] = scores
    csv_df = pd.concat(
        [csv_df,
         pd.DataFrame(features_list).add_prefix('risk_factor_')],
        axis=1)

    # Sort by risk score; the score itself is not part of the output
    csv_df = csv_df.sort_values(
        by=['complaint_risk'],
        ascending=False).drop(columns=['complaint_risk']).reset_index(
            drop=True)

    # Flag quality reviews, according to best model k%: at least one tender,
    # and never more than there are rows (empty input or k > 1)
    n_tenders = csv_df.shape[0]
    n_flags = min(n_tenders, max(1, round(args['k'] * n_tenders)))
    csv_df['quality_review'] = [1] * n_flags + [0] * (n_tenders - n_flags)

    if save_csv:
        csv_df.to_csv(
            f'{output_folder}/{dt.date.today()}_prioritized_tenders.csv',
            index=False,
            header=True)

    return csv_df