import os
import pickle

import pandas as pd

from app.z_helpers import helpers as my


def _download_data_from_sql(data_version='final_data', recache=False):
    """Fetch the raw dataset from Postgres, caching it locally as a CSV plus a
    pickled dtype map so repeated runs skip the database round trip."""
    from app.b_data_cleaning import get_dataset_registry
    sql_table_name = get_dataset_registry()[data_version]['sql_table']
    query = 'SELECT * FROM {}'.format(sql_table_name)

    param_dic = my.get_credentials(credential='aws_databases')['aws']

    cache_folder = os.path.join(my.get_project_directories(key='cache_dir'), 'raw_data')
    data_file = os.path.join(cache_folder, data_version + '.csv')
    dtypes_file = data_file[:-4] + '.dtypes'
    os.makedirs(cache_folder, exist_ok=True)

    if recache or not os.path.exists(data_file):
        print('Getting raw data via sql...')

        with my.postgresql_connect(param_dic) as conn:
            df = pd.read_sql_query(query, con=conn)

        # Force object columns to plain strings so the CSV round trip is lossless.
        obj_cols = df.select_dtypes(include='object').columns
        df[obj_cols] = df[obj_cols].astype(str)
        df.to_csv(data_file, index=False)

        # Persist the dtype map next to the CSV, mapping object dtypes to str so
        # read_csv can reapply them. (np.object is removed in NumPy >= 1.24, so
        # the comparison uses plain `object`.)
        with open(dtypes_file, 'wb') as f:
            dtypes = {col: (str if dtype == object else dtype)
                      for col, dtype in df.dtypes.to_dict().items()}
            pickle.dump(dtypes, f)
        print('Raw data cached.')
    else:
        print('Raw data already cached.')
        with open(dtypes_file, 'rb') as f:
            dtypes = pickle.load(f)
        df = pd.read_csv(data_file, dtype=dtypes, index_col=False)

    if data_version == 'handpicked_dataset':
        # The handpicked dataset ships a data dictionary describing each column
        # and the rules for filling its NaNs.
        app_dir = my.get_project_directories(key='app_dir')
        file_path = os.path.join(app_dir, 'a_get_data', 'reuters_eikon', 'key_reuters_fields.csv')
        data_dict = pd.read_csv(file_path)
        data_dict['Clear Name'] = data_dict['Clear Name'].str.lower()
        data_dict = data_dict.set_index('Clear Name')
        new_data_dict = data_dict[['Data Type', 'Variable Type']].to_dict(orient='index')

        # Collect the fillnan_* rule columns in a stable, case-insensitive order.
        fillnan_cols = sorted(
            [col for col in data_dict.columns if col.startswith('fillnan_')],
            key=str.lower)

        formula_methods = []
        for index, row in data_dict[fillnan_cols].iterrows():
            rules = [x for x in row.tolist() if str(x) != 'nan']
            new_data_dict[index]['Fill NaN Rules'] = rules
            # Rules of the form 'formula:<method>' are collected separately as
            # (field, method) tuples.
            for method in [r.split(':')[1] for r in rules if r.split(':')[0] == 'formula']:
                formula_methods.append((index, method))
    else:
        new_data_dict = None
        formula_methods = None

    return df, data_file, new_data_dict, formula_methods
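
# Minimal sketch (not part of the original module) of the fillnan-rule parsing
# performed above. The field name 'eps' and the method 'linear' are assumed
# example values, not entries read from key_reuters_fields.csv.
def _demo_fillnan_parsing():
    # One data-dictionary row as the loop above sees it: two rule slots, one empty.
    field, cells = 'eps', ['formula:linear', float('nan')]
    rules = [x for x in cells if str(x) != 'nan']            # -> ['formula:linear']
    methods = [(field, r.split(':')[1])                      # -> [('eps', 'linear')]
               for r in rules if r.split(':')[0] == 'formula']
    return rules, methods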
    # (Tail of the cleaning entry point; its opening lines, which open text_file
    # and resolve cache_file, precede this excerpt.)
    info_text = text_file.read()

    df = pd.read_csv(cache_file)
    print(info_text)
    return df


if __name__ == '__main__':
    # Working directory must be the higher .../app folder
    from app.z_helpers import helpers as my
    my.convenience_settings()

    dataset_name = 'handpicked_dataset'

    from app.b_data_cleaning import get_dataset_registry
    dataset_props = get_dataset_registry()[dataset_name]

    recache_raw_data = False
    redo_cleaning = False

    comp_col = dataset_props['company_col']
    time_cols = dataset_props['iter_cols']
    industry_col = dataset_props['industry_col']

    # Rows missing these columns are dropped before / after NaN filling.
    drop_row_if_col_not_filled_before_filling = ['sales', 'eps']
    drop_row_if_col_not_filled_after_filling = ['ebit']

    df = get_clean_data(data_version=dataset_name,
                        recache_raw_data=recache_raw_data,
                        redo_data_cleaning=redo_cleaning,
                        comp_col=comp_col,
                        time_cols=time_cols,
                        industry_col=industry_col,
                        # The source is truncated at this point; the remaining
                        # keyword names are assumed from the variables above.
                        drop_row_if_col_not_filled_before_filling=drop_row_if_col_not_filled_before_filling,
                        drop_row_if_col_not_filled_after_filling=drop_row_if_col_not_filled_after_filling)
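
# Cache layout produced by _download_data_from_sql (the cache root comes from
# my.get_project_directories(key='cache_dir'); the absolute path is
# machine-specific and assumed here):
#
#   <cache_dir>/raw_data/handpicked_dataset.csv      # raw table dump
#   <cache_dir>/raw_data/handpicked_dataset.dtypes   # pickled dtype map for reload
#
# Deleting the CSV, or threading recache=True through, forces a fresh Postgres
# query on the next run; note that the cached-reload branch requires the
# .dtypes file to be present alongside the CSV.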