def check_and_compare_version(external_version: Optional[str] = str()) -> Tuple[bool, str]:
    """Checks the currently installed version of sheetwork and compares it to the one on PyPI.

    This requires an internet connection. When one isn't available a URLError will most likely
    be thrown; in that case we just return False so we don't degrade the user experience.

    Args:
        external_version (Optional[str], optional): Mainly for testing purposes. Defaults to str().

    Returns:
        Tuple[bool, str]: True and the PyPI version when sheetwork needs an update,
            False and an empty string otherwise.
    """
    try:
        pypi_version: str = luddite.get_version_pypi("sheetwork")
        if external_version:
            installed_version = external_version
        else:
            installed_version = __version__

        needs_update = semver_parse(pypi_version) > semver_parse(installed_version)
        if needs_update:
            logger.warning(
                yellow(
                    f"Looks like you're a bit behind. A newer version of Sheetwork "
                    f"v{pypi_version} is available."
                )
            )
        return needs_update, pypi_version

    except URLError:
        return False, str()
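
# Illustrative usage sketch (not part of the original source): how a caller might act on the
# (needs_update, pypi_version) tuple returned above. The surrounding wiring is hypothetical.
#
#     needs_update, latest = check_and_compare_version()
#     if needs_update:
#         logger.info(f"Consider upgrading sheetwork to v{latest}.")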
def read_profile(self):
    logger.debug(f"Profile Name: {self.profile_name}")
    filename = Path(self.profile_dir, "profiles.yml")
    if filename.exists():
        yaml_dict = open_yaml(filename)
        is_valid_yaml = validate_yaml(yaml_dict, profiles_schema)
        profile = yaml_dict["profiles"].get(self.profile_name)
        if profile:
            # set target name from profile unless one was given at init from flags parse.
            if not self.target_name:
                self.target_name = profile.get("target")
            if profile.get("outputs"):
                target_profile = profile["outputs"].get(self.target_name)
                if target_profile and is_valid_yaml:
                    is_valid_profile = self._validate_profile(target_profile)
                    if is_valid_profile:
                        self.profile_dict = target_profile
                else:
                    raise ProfileParserError(
                        f"Error finding an entry for target: {self.target_name}, "
                        f"under the {self.profile_name} profile."
                    )
        else:
            raise ProfileParserError(
                f"Could not find an entry for {self.profile_name} in your profiles.yml"
            )
    else:
        raise FileNotFoundError(
            f"Could not open or find {filename.resolve()}, check that it exists"
        )
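
# Illustrative sketch (not part of the original source): the minimal profiles.yml shape this
# parser walks. The profile name and values below are hypothetical; the `db_type`/`guser`
# keys mirror the required keys referenced elsewhere in this codebase.
#
#     profiles:
#       my_project:
#         target: dev
#         outputs:
#           dev:
#             db_type: snowflake
#             guser: user@example.com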
def _override_gspread_default_creds(self) -> None:
    """Temporary workaround to allow `gspread.oauth()` to look for credentials in another location.

    For more info: https://github.com/burnash/gspread/issues/826
    This will likely be removed if work on gspread #826 gets carried out.
    """
    logger.debug(
        "Overriding `gspread`'s DEFAULT_AUTHORISED_USER_FILENAME and stuff. "
        "This is temporary (hopefully) see `GoogleSpreadsheet._override_gspread_default_creds()` "
        "docstring for more info."
    )
    logger.debug(
        f"Overriding to: {self._profile.google_credentials_dir}/{self._profile.profile_name}"
    )
    gspread.auth.DEFAULT_CONFIG_DIR = Path(self._profile.google_credentials_dir)
    gspread.auth.DEFAULT_CREDENTIALS_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
        self._profile.profile_name
    ).with_suffix(self.CREDS_EXT)
    gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
        f"{self._profile.profile_name}_authorised_user"
    ).with_suffix(self.CREDS_EXT)
    gspread.auth.DEFAULT_SERVICE_ACCOUNT_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
        f"{self._profile.profile_name}_service_account"
    ).with_suffix(self.CREDS_EXT)

    # doing this skipping for when I'm testing this function
    gspread.auth.load_credentials.__defaults__ = (
        gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME,
    )
    gspread.auth.store_credentials.__defaults__ = (
        gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME,
        "token",
    )
def show_complete(self):
    credentials_message = TO_DO_CREDENTIALS.format(
        to_do_credentials=TO_DO_CREDENTIALS,
        open_cmd=open_dir_cmd(),
        profiles_path=self.profiles_path,
        profiles_doc_url=PROFILE_DOC_URL,
        google_creds_doc_url=GOOGLE_CREDS_DOC_URL,
        project_name=self.project_name,
    )
    if self.project_dir_is_created:
        done_message = INIT_DONE.format(
            project_name=self.project_name,
            project_path=self.project_path,
            profiles_path=self.profiles_path,
            google_path=self.google_path,
            profiles_doc_url=PROFILE_DOC_URL,
            google_creds_doc_url=GOOGLE_CREDS_DOC_URL,
            project_doc_url=PROJECT_DOC_URL,
            sheets_config_doc_url=SHEETS_CONFIG_DOC_URL,
            to_do_credentials=credentials_message,
            open_cmd=open_dir_cmd(),
        )
    else:
        done_message = CREDENTIALS_ONLY_SUCCESS_MESSAGE.format(
            to_do_credentials=credentials_message
        )
    logger.info(green(done_message))
def find_nearest_dir_and_file(
    self, yaml_file: str, current: Path = Path.cwd()
) -> Tuple[Path, Path]:
    """Looks for the yaml_file you ask for.

    Starts from the current directory and walks up the tree while the iteration number is
    still within the max allowed.

    Args:
        yaml_file (str): Name and extension of the file to find.
        current (Path, optional): Path object from which to start. Defaults to Path.cwd().

    Raises:
        NearestFileNotFound: When no file that matches the required name can be found.

    Returns:
        Tuple[Path, Path]: The directory up to the file name, and the full path to the
            filename, respectively. Maybe we'll end up deprecating one of these returns
            down the line but for now it's handy.
    """
    filename = Path(current, yaml_file)
    while self.iteration < self.max_iter:
        logger.debug(f"Looking for {filename}")
        if filename.exists():
            project_dir = filename.parent
            logger.debug(f"{filename} exists and was returned")
            return project_dir, filename
        current = current.parent
        filename = Path(current, yaml_file)
        self.iteration += 1
    raise NearestFileNotFound(
        f"Unable to find {yaml_file} in the nearby directories after {self.max_iter} "
        "iterations upwards."
    )
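
# Illustrative sketch (not part of the original source): assuming this method lives on a
# finder object that carries `iteration`/`max_iter` counters, a caller could locate the
# nearest project file like so. The object and file names below are hypothetical.
#
#     project_dir, project_file = finder.find_nearest_dir_and_file("sheetwork_project.yml")
#     print(f"Found {project_file} under {project_dir}")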
def make_dir(path: "Path"):
    """Creates a directory.

    Args:
        path (Path): Where you want it to be.
    """
    logger.debug(f"Making folder: {path}")
    path.mkdir()
def run(self):
    self.load_sheet()
    if self.push_anyway:
        self.push_sheet()
        self.check_table()
    else:
        logger.info(yellow("Nothing pushed since you were in --dry_run mode."))
def authenticate(self) -> None:
    if self.is_service_account:
        logger.debug("Using SERVICE_ACCOUNT auth")
        self.google_client = gspread.service_account(self.creds_path)
    else:
        logger.debug("Using END_USER auth")
        # ! This override should be temporary ideally we'll have a more long term solution in:
        # ! https://github.com/burnash/gspread/issues/826
        self._override_gspread_default_creds()
        self.google_client = gspread.oauth()
    self.is_authenticated = True
def make_file(path: "Path", contents: str = str()):
    """Creates a text file with potential things in it. WOW!

    Args:
        path (Path): Where you want it to be.
        contents (str, optional): What you want to put in that text file. Defaults to str().
    """
    logger.debug(f"Making file: {path}")
    path.touch()
    if contents:
        with path.open("w", encoding="utf-8") as f:
            f.write(contents)
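
# Illustrative sketch (not part of the original source): scaffolding a hypothetical folder
# and config file with the two helpers above. The path and contents are made up.
#
#     target = Path("example_project")
#     make_dir(target)
#     make_file(target / "sheets.yml", contents="sheets:\n")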
def create_google_dir_and_file(self):
    self.google_path = self.profiles_path / "google"
    google_file = self.google_path / f"{self.project_name}.json"
    if not self.google_path.exists():
        make_dir(self.google_path)
    else:
        logger.debug(f"{self.google_path} already exists.")
    if not google_file.exists():
        make_file(google_file)
    else:
        logger.debug(f"{google_file} already exists.")
def run_cleanup(self, df: pandas.DataFrame) -> Tuple[bool, pandas.DataFrame]:
    clean_up = True
    # check for interactive mode
    if self.flags.interactive:
        logger.info(
            yellow(
                "PRE-CLEANING PREVIEW: The DataFrame you would push to the database "
                "would look like this:"
            )
        )
        self._show_dry_run_preview(df)
        clean_up = self._collect_and_check_answer()

    if clean_up is True:
        logger.debug("Performing clean ups")
        clean_df = SheetCleaner(
            df, bool(self.config.sheet_config.get("snake_case_camel", False))
        ).cleanup()
        if self.flags.dry_run or self.flags.interactive:
            logger.info(yellow("\nPOST-CLEANING PREVIEW:"))
            self._show_dry_run_preview(clean_df)
            carry_on = self._collect_and_check_answer(post_cleanup=True)
            if not carry_on:
                logger.info(timed_message(red("User Aborted.")))
                sys.exit(1)
        return True, clean_df
    return True, df
def push_sheet(self):
    logger.info(timed_message("Pushing sheet to database..."))
    logger.debug(f"Column override dict is a {type(self.config.sheet_columns)}")
    logger.debug(f"Sheet columns: {self.config.sheet_columns}")
    logger.debug(f"Columns in final df: {self.sheet_df.columns.tolist()}")
    self.sql_adapter.upload(self.sheet_df, self.target_schema)
def create_project_dir(self):
    project_dir = self.project_path / f"{self.project_name}"
    if not project_dir.exists():
        make_dir(project_dir)
        self.project_dir_is_created = True
    elif self.flags.force_credentials:
        logger.warning(f"{self.project_name} already exists, moving on to credential files.")
    else:
        raise ProjectIsAlreadyCreated(
            f"""\n
            {self.project_name} already exists, so we'll stop.
            If you created it by mistake, delete it and run this again.
            If you want to generate the profiles and credentials files only, use the
            --force-credentials-folders CLI argument (see help for more info).
            """
        )
def _create_schema(self) -> None:
    if self._has_connection is False:
        raise NoAcquiredConnectionError(
            f"No acquired connection for {type(self).__name__}. Make sure you call "
            "`acquire_connection` before."
        )
    try:
        if self.config.project.object_creation_dct["create_schema"]:
            schema_exists = (
                self.config.target_schema in self.con.dialect.get_schema_names(self.con)
            )
            if schema_exists is False:
                logger.debug(
                    yellow(f"Creating schema: {self.config.target_schema} in {self._database}")
                )
                self.con.execute(CreateSchema(self.config.target_schema))
    except Exception as e:
        raise DatabaseError(str(e))
@staticmethod
def _collect_and_check_answer(post_cleanup: bool = False):
    acceptable_answers = ["y", "n", "a"]
    user_input = None
    while user_input not in acceptable_answers:
        if user_input is not None:
            logger.info("Choose 'y':yes, 'n':no, 'a':abort")
        if post_cleanup:
            user_input = input("Would you like to push to db? (y/n):")
        else:
            user_input = input("Would you like to perform cleanup? (y/n/a): ")
        if user_input.lower() == "y":
            return True
        if user_input.lower() == "n":
            return False
        if user_input.lower() == "a":
            logger.info(red("User aborted."))
            sys.exit(1)
def __init__(self, project: Project, target_name: str = str()):
    """Profile constructor. Mainly just needs an initted Project object.

    Args:
        project (Project): initted project object.
        target_name (str, optional): Mainly used in unit testing if you want to override the
            project name. Pretty useless in all other practice cases I think. Defaults to str().
    """
    self.profile_name = project.project_name
    self.target_name = target_name
    self.profile_dict: Dict[str, str] = dict()
    self.cannot_be_none = {"db_type", "guser"}
    self.profile_dir: Path = project.profile_dir
    self.google_credentials_dir = Path(project.profile_dir, "google").resolve()
    self.read_profile()
    logger.debug(f"PROFILE_DIR {self.profile_dir}")
    logger.debug(f"PROFILE_NAME: {self.profile_name}")
def override_object_creation_from_flags(self) -> None:
    if self.flags.create_table:
        logger.debug(yellow("going to create table"))
        self.object_creation_dct.update({"create_table": True})

    if self.flags.create_schema:
        logger.debug(yellow("going to create schema"))
        self.object_creation_dct.update({"create_schema": True})
    logger.debug(yellow(f"Object creation dict after override\n {self.object_creation_dct}"))

    if self.flags.destructive_create_table:
        logger.debug(yellow("going to perform destructive table creation"))
        self.destructive_create_table = True
def cast_pandas_dtypes(
    df: pandas.DataFrame, overwrite_dict: dict = dict()
) -> pandas.DataFrame:
    """Converts a dataframe's columns along a provided dictionary of {col: dtype}.

    Args:
        df (pandas.DataFrame): dataframe to cast.
        overwrite_dict (dict, optional): Dict of shape {column: dtype}. Defaults to dict().

    Raises:
        UnsupportedDataTypeError: When a dtype isn't currently supported
            (see dtypes_map inside function).
        ColumnNotFoundInDataFrame: When a column that is required for casting isn't found.

    Returns:
        pandas.DataFrame: df with converted dtypes.
    """
    overwrite_dict = overwrite_dict.copy()
    dtypes_map = dict(
        varchar="object",
        # this is intentional in case of nulls. Currently pandas doesn't play well with
        # converting mixed types, see https://github.com/bastienboutonnet/sheetwork/issues/204
        # for more details.
        int="object",
        numeric="float64",
        # ! HOT_FIX
        # the nullable pandas dtype is intentional,
        # see https://github.com/bastienboutonnet/sheetwork/issues/288
        boolean="boolean",
        timestamp_ntz="datetime64[ns]",
        # this is intentional: pandas doesn't really have just dates.
        date="datetime64[ns]",
    )

    # Check for type support
    unsupported_dtypes = set(overwrite_dict.values()).difference(dtypes_map.keys())
    if unsupported_dtypes:
        raise UnsupportedDataTypeError(f"{unsupported_dtypes} are currently not supported")

    # check overwrite col is in df
    invalid_columns = set(overwrite_dict.keys()).difference(set(df.columns.tolist()))
    if invalid_columns:
        raise ColumnNotFoundInDataFrame(f"{invalid_columns} not in DataFrame. Check spelling?")

    # recode dict in pandas terms
    for col, data_type in overwrite_dict.items():
        overwrite_dict.update({col: dtypes_map[data_type]})

    # cast
    logger.debug(f"DF BEFORE CASTING: {df.head()}")
    logger.debug(f"DF BEFORE CASTING DTYPES: {df.dtypes}")

    # handle booleans "manually" because .astype(bool) leads to everything being True if not null.
    df = handle_booleans(df, overwrite_dict=overwrite_dict)

    # use pandas' native function for all other data types as they are not problematic and we
    # have already handled booleans specifically.
    df = df.astype(overwrite_dict)
    logger.debug(f"Head of cast dataframe:\n {df.head()}")
    return df
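
# Illustrative sketch (not part of the original source): casting a tiny frame with an
# overwrite dict that uses the dtype labels supported by dtypes_map above. The column
# names and values are made up; note `user_id` stays a string per the "int" mapping note.
#
#     sample = pandas.DataFrame({"signed_up_at": ["2021-01-01"], "user_id": ["42"]})
#     cast = cast_pandas_dtypes(sample, overwrite_dict={"signed_up_at": "date", "user_id": "int"})
#     print(cast.dtypes)  # signed_up_at -> datetime64[ns], user_id -> object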
def decide_object_creation(self) -> None:
    self.handle_deprecations()
    create_everything_label = "always_create_objects"
    object_creation_mapping = {
        # ! DEPRECATE "always_create"
        "create_table": ["always_create_table", "always_create"],
        "create_schema": ["always_create_schema"],
    }
    for object_type, rule in object_creation_mapping.items():
        if self.project_dict.get(create_everything_label):
            create = [True]
        else:
            create = [True for x in rule if self.project_dict.get(x) is True]
        self.object_creation_dct.update({object_type: True in create})
    self.destructive_create_table = (
        True
        if self.project_dict.get("destructive_create_table", self.destructive_create_table)
        is True
        else False
    )
    logger.debug(yellow(f"Object creation dict:\n {self.object_creation_dct}"))
    logger.debug(yellow(str(self.project_dict)))
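
# Illustrative sketch (not part of the original source): with a project_dict loaded as below,
# the loop above flips both creation flags on while destructive_create_table stays False.
#
#     self.project_dict = {"always_create_objects": True}
#     self.decide_object_creation()
#     # self.object_creation_dct == {"create_table": True, "create_schema": True}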
def check_columns_in_df(
    df: pandas.DataFrame,
    columns: Union[List[str], str],
    warn_only: bool = False,
    suppress_warning: bool = False,
) -> Tuple[bool, List[str]]:
    """Checks if a bunch of columns are present in a dataframe.

    Args:
        df (pandas.DataFrame): df to check.
        columns (Union[List[str], str]): column names to check for.
        warn_only (bool, optional): When True will only warn, otherwise raises. Defaults to False.
        suppress_warning (bool, optional): When True the warning isn't shown, only returned.
            Defaults to False.

    Raises:
        ColumnNotFoundInDataFrame: If warn_only is False, this error will be raised when any of
            the columns to check for are not present in the dataframe.

    Returns:
        Tuple[bool, List[str]]: Boolean for whether all columns are present in df, and the list
            of requested columns that were found in df.
    """
    if isinstance(columns, str):
        columns = [columns]
    is_subset = set(columns).issubset(df.columns)
    if is_subset:
        return True, columns

    # else reduce columns, provide filtered list, set bool to False and warn or raise.
    cols_not_in_df = [x for x in columns if x not in df.columns.tolist()]
    reduced_cols = [x for x in columns if x in df.columns.tolist()]
    message = f"The following columns were not found in the sheet: {cols_not_in_df} "
    if warn_only and not suppress_warning:
        logger.warning(
            yellow(message + "they were ignored. Consider cleaning your sheets.yml file")
        )
    elif not warn_only and not suppress_warning:
        raise ColumnNotFoundInDataFrame(message + "Google Sheet or sheets.yml needs to be cleaned")
    return False, reduced_cols
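
# Illustrative sketch (not part of the original source): checking for a column that is
# missing, with warn_only=True, returns False plus the columns that were actually found.
#
#     frame = pandas.DataFrame({"a": [1], "b": [2]})
#     ok, found = check_columns_in_df(frame, ["a", "c"], warn_only=True, suppress_warning=True)
#     # ok is False, found == ["a"]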
def run(self):
    # print something cos it's fun!
    print(
        r"""
   ______ __ __
  / __/ / ___ ___ / /__ _____ ____/ /__
 _\ \/ _ \/ -_) -_) __/ |/|/ / _ \/ __/ '_/
/___/_//_/\__/\__/\__/|__,__/\___/_/ /_/\_\\
        """
    )
    logger.info("Alright let's get to work")
    logger.info("❤️ Taking peanut butter and jelly out of the cupboard 🍇")
    time.sleep(3)

    # do the actual work people cared about in the first place.
    self.assert_project_name()
    self.override_paths()
    self.create_project_dir()
    self.create_project_file()
    self.create_profiles_dir()
    self.create_profiles_file()
    self.create_google_dir_and_file()
    self.show_complete()
def check_table(self, target_schema: str, target_table: str) -> None:
    columns_query = f"""
        select count(*)
        from {self._database}.information_schema.columns
        where table_catalog = '{self._database.upper()}'
        and table_schema = '{target_schema.upper()}'
        and table_name = '{target_table.upper()}'
        ;
    """
    rows_query = f"select count(*) from {target_schema}.{target_table}"
    columns = self.excecute_query(columns_query, return_results=True)
    rows = self.excecute_query(rows_query, return_results=True)
    if columns and rows:
        logger.info(
            timed_message(
                green(
                    f"Push successful for "
                    f"{self._database}.{target_schema}.{target_table} \n"
                    f"Found {columns[0][0]} columns and {rows[0][0]} rows."
                )
            )
        )
    else:
        raise TableDoesNotExist(
            f"Table {self._database}.{target_schema}.{target_table} seems empty"
        )
def __init__(self, flags: FlagParser) -> None:
    """Constructs project object.

    Args:
        flags (FlagParser): Inited flags object.
    """
    self.project_dict: Dict[str, Union[str, bool]] = dict()
    self.target_schema: str = str()
    self.object_creation_dct: Dict[str, bool] = dict()
    self.destructive_create_table: bool = False
    self.flags = flags

    # directories (first overwritten by flags, then by project). This may not always be able
    # to stay this way; we might want to give priority to the CLI, but for now it removes
    # some complication.
    self.project_file_fullpath: Path = Path("dumpy_path")
    self.profile_dir: Path = Path("~/.sheetwork/").expanduser()
    self.sheet_config_dir: Path = Path.cwd()

    # override defaults
    self.override_paths_from_flags()
    self.load_project_from_yaml()
    self.decide_object_creation()
    self.override_object_creation_from_flags()
    logger.debug(f"Project name: {self.project_name}")
def deprecate(message: str, colour: str = "yellow") -> None:
    """Handles deprecation messages using proper DeprecationWarnings.

    It also makes sure deprecation warnings are enabled globally as certain shells might have
    them turned off by default.

    Args:
        message (str): Deprecation message to print.
        colour (str, optional): Colour name to wrap the deprecation message. For now only
            "yellow", "red" or None are supported. Defaults to "yellow".
    """
    global DEPRECATION_WARNINGS_ENABLED, _WARNINGS_ALREADY_ENABLED
    if colour == "yellow":
        _message = yellow(message)
    elif colour == "red":
        _message = red(message)
    elif colour is None:
        _message = message
    else:
        logger.error(f"{colour} is not supported, painting error message 'yellow'")
        _message = yellow(message)

    if DEPRECATION_WARNINGS_ENABLED and not _WARNINGS_ALREADY_ENABLED:
        _WARNINGS_ALREADY_ENABLED = True
        warnings.filterwarnings(
            "default", ".*", category=DeprecationWarning, module="gspread_pandas"
        )
    if _WARNINGS_ALREADY_ENABLED and not DEPRECATION_WARNINGS_ENABLED:
        warnings.filterwarnings(
            "ignore", ".*", category=DeprecationWarning, module="gspread_pandas"
        )
    warnings.warn(_message, DeprecationWarning, stacklevel=2)
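
# Illustrative sketch (not part of the original source): flagging the deprecated
# "always_create" project key referenced elsewhere in this codebase. The exact wording
# of the message is hypothetical.
#
#     deprecate(
#         "'always_create' is deprecated, use 'always_create_table' instead.",
#         colour="yellow",
#     )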
def make_df_from_worksheet(
    self, worksheet_name: str = str(), grab_header: bool = True
) -> pandas.DataFrame:
    if not self.workbook:
        raise NoWorkbookLoadedError(
            "Workbook object seems empty, cannot turn a None object into a dataframe"
        )
    try:
        if worksheet_name:
            worksheet = self.workbook.worksheet(worksheet_name)
        else:
            worksheet_name = "default sheet"
            worksheet = self.workbook.get_worksheet(0)
        logger.debug(green("Sheet loaded successfully"))
        if grab_header:
            values: List[Any] = worksheet.get_all_values()
            check_dupe_cols(values[0])
            df = pandas.DataFrame(values[1:], columns=values[0])
        else:
            df = pandas.DataFrame(worksheet.get_all_values())
        logger.debug(yellow(f"Raw obtained google sheet: \n {df.head()}"))
        return df
    except Exception as e:
        raise SheetLoadingError(f"Error loading sheet: \n {e}")
def upload(self, df: pandas.DataFrame, override_schema: str = str()) -> None:
    # cast columns
    # !: note integer conversion doesn't actually happen it is left as a str see #204, #205
    df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
    dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

    # potentially override target schema from config.
    if override_schema:
        schema = override_schema
    else:
        schema = self.config.target_schema

    # write to csv and try to talk to db
    temp = tempfile.NamedTemporaryFile()
    df.to_csv(temp.name, index=False, header=False, sep="|")

    self.acquire_connection()

    # set up schema creation
    self._create_schema()

    try:
        # set the table creation behaviour
        _if_exists = "fail"
        if self.config.project.object_creation_dct["create_table"] is True:
            if self.config.project.destructive_create_table:
                _if_exists = "replace"

            # perform the create ops
            try:
                df.head(0).to_sql(
                    name=self.config.target_table,
                    schema=schema,
                    con=self.con,
                    if_exists=_if_exists,
                    index=False,
                    dtype=dtypes_dict,
                )
            # if _if_exists is "fail" pandas will throw a ValueError, which we want to escape
            # when destructive_create_table is set to False (or not provided) and throw a
            # warning instead.
            except ValueError as e:
                if _if_exists == "fail":
                    logger.warning(
                        yellow(
                            f"{self._database}"
                            f".{schema}.{self.config.target_table} already exists and was not\n"
                            "recreated because 'destructive_create_table' is set to False in "
                            "your profile \n"
                            "APPENDING instead."
                        )
                    )
                else:
                    raise DatabaseError(str(e))

        # Now push the actual data -- the pandas create above is only for table creation; the
        # staging logic below is faster than pandas, which inserts row by row.
        qualified_table = (
            f"{self._database}.{self.config.target_schema}.{self.config.target_table}"
        )
        self.con.execute(
            f"""
            create or replace temporary stage {self.config.target_table}_stg
            file_format = (type = 'CSV' field_delimiter = '|'
            skip_header = 0 field_optionally_enclosed_by = '"')
            """
        )
        self.con.execute(f"put file://{temp.name} @{self.config.target_table}_stg")
        self.con.execute(f"copy into {qualified_table} from @{self.config.target_table}_stg")
        self.con.execute(f"drop stage {self.config.target_table}_stg")

    except Exception as e:
        raise DatabaseError(str(e))

    finally:
        logger.debug("CLOSING CONNECTION & CLEANING TMP FILE")
        temp.close()
        self.close_connection()
def create_profiles_dir(self):
    if not self.profiles_path.exists():
        make_dir(self.profiles_path)
    else:
        logger.debug(f"{self.profiles_path} already exists.")
def load_sheet(self):
    """Loads a google sheet, and calls clean up steps if applicable.

    Sheet must have been shared with account admin email address used in storage.

    Raises:
        TypeError: When loader does not return results that can be converted into a pandas
            DataFrame a type error will be raised.
    """
    if self.flags.sheet_name:
        logger.info(timed_message(f"Importing: {self.flags.sheet_name}"))
        logger.debug(f"Importing data from: {self.config.sheet_config['sheet_key']}")
    else:
        logger.info(
            timed_message(f"Importing data from: {self.config.sheet_config.get('sheet_key')}")
        )
    df = self._obtain_googlesheet()
    if not isinstance(df, pandas.DataFrame):
        raise TypeError("import_sheet did not return a pandas DataFrame")
    logger.debug(f"Columns imported from sheet: {df.columns.tolist()}")

    # Perform exclusions, renamings and cleanups before releasing the sheet.
    df = self.exclude_columns(df)
    df = self.rename_columns(df)
    self.push_anyway, df = self.run_cleanup(df)
    logger.debug(f"Columns after cleanups and exclusions: {df.columns}")
    logger.debug(f"Loaded SHEET HEAD: {df}")
    self.sheet_df = df
def create_profiles_file(self):
    profile_file = Path(self.profiles_path, "profiles").with_suffix(".yml")
    if not profile_file.exists():
        make_file(profile_file)
    else:
        logger.debug(f"{profile_file} already exists.")