class Connector:
    """ETL connector that loads downloaded report files into SQL tables.

    Two entry points are exposed:

    * ``sync_all_ftp_data`` downloads each configured report directory and
      appends any not-yet-loaded daily files to the matching ``Clever_*``
      table.
    * ``sync_student_google_accounts`` exports the student Google accounts
      via a browser helper and replaces ``Clever_StudentGoogleAccounts``.

    NOTE(review): ``MSSQL``, ``FTP``, ``Browser`` and ``data_reports`` are
    defined outside this class; their exact contracts are assumed from how
    they are called here -- confirm against their definitions.
    """

    def __init__(self):
        # Local directory that downloaded files are read from.
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        """Download every configured report directory and load new records.

        Iterates over ``data_reports`` (table name -> directory name),
        downloads the directory, then loads any new daily files into the
        corresponding table.
        """
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse.

        Files are loaded for every day from the day after the latest
        ``date`` already in ``Clever_<table_name>`` through yesterday.
        Nothing is inserted when the table is already current.
        """
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        # ``datetime.today()`` carries a time-of-day, so "yesterday" is a
        # point during yesterday; ``start_date`` is at midnight.
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(
                f"Clever_{table_name} is up to date. No records inserted."
            )
            return

        file_names = self._generate_file_names(start_date, yesterday, report_name)
        df = self._read_and_concat_files(file_names)
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="append")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table.

        Returns:
            A ``datetime`` parsed from the newest ``date`` value using the
            ``%Y-%m-%d`` format.
        """
        # NOTE(review): assumes the query result supports ``["date"][0]``
        # (e.g. a DataFrame) and that the table has at least one row with a
        # string date -- confirm against ``MSSQL.query``.
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        """Build one file name per day from ``start_date`` through ``yesterday``.

        Names follow ``<YYYY-MM-DD>-<report_name>-students.csv``. Returns an
        empty list when ``start_date`` is after ``yesterday``.
        """
        file_names = []
        current = start_date
        while current <= yesterday:  # inclusive of yesterday's date
            formatted_date = current.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            current += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        """Read each named CSV from ``data_dir`` and concatenate the results.

        Raises:
            FileNotFoundError: propagated from ``pd.read_csv`` when a file
                is missing.
            ValueError: propagated from ``pd.concat`` when ``file_names``
                is empty.
        """
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        return pd.concat(dfs)

    def sync_student_google_accounts(self):
        """Get student emails from Google Accounts Manager app."""
        browser = Browser(self.data_dir)
        browser.export_student_google_accounts()
        # Transform and load csv data into database table
        df = self._get_data_from_csv_by_name("Student_export")
        df = df.rename(columns={"ID": "SIS_ID"})
        self.sql.insert_into("Clever_StudentGoogleAccounts", df, if_exists="replace")
        logging.info(
            f"Inserted {len(df)} new records into Clever_StudentGoogleAccounts."
        )

    def _get_data_from_csv_by_name(self, string_to_match):
        """Get the downloaded csv BY NAME and store it in a dataframe.

        The first entry in ``data_dir`` whose name contains
        ``string_to_match`` is read.

        Raises:
            FileNotFoundError: when no file in ``data_dir`` matches, instead
                of failing later on an unbound local variable.
        """
        pattern = f"*{string_to_match}*"
        filename = next(
            (name for name in os.listdir(self.data_dir) if fnmatch(name, pattern)),
            None,
        )
        if filename is None:
            raise FileNotFoundError(
                f"No file matching '{pattern}' found in '{self.data_dir}'."
            )
        file_path = f"{self.data_dir}/{filename}"
        df = pd.read_csv(file_path)
        logging.info(f"Loaded {len(df)} records from downloaded file.")
        return df
class Connector:
    """ETL connector: pulls report files over FTP and loads them into SQL."""

    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        """Download each configured report directory, then load its records."""
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Route a report to the loader that matches its file-naming scheme."""
        # The "idm-reports" folder holds the student emails file, whose name
        # carries no datestamp; every other report uses dated file names.
        handler = (
            self._process_files_without_datestamp
            if report_name == "idm-reports"
            else self._process_files_with_datestamp
        )
        handler(table_name, report_name)

    def _process_files_without_datestamp(self, table_name, report_name):
        """Reload the student emails table from its single undated file.

        The target table is replaced rather than appended to.
        """
        target_table = f"Clever_{table_name}"
        emails = self._read_file(f"{self.data_dir}/google-student-emails.csv")
        self.sql.insert_into(target_table, emails, if_exists="replace")
        logging.info(f"Inserted {len(emails)} records into Clever_{table_name}.")

    def _process_files_with_datestamp(self, table_name, report_name):
        """Append all dated files newer than the table's latest date.

        Covers the day after the latest stored date through yesterday; logs
        and returns without inserting when there is nothing new.
        """
        one_day = timedelta(days=1)
        first_missing_day = self._get_latest_date(table_name) + one_day
        yesterday = datetime.today() - one_day

        if not first_missing_day > yesterday:
            pending_files = self._generate_file_names(
                first_missing_day, yesterday, report_name
            )
            records = self._read_and_concat_files(pending_files)
            self.sql.insert_into(f"Clever_{table_name}", records, if_exists="append")
            logging.info(f"Inserted {len(records)} records into Clever_{table_name}.")
            return

        logging.info(f"Clever_{table_name} is up to date. No records inserted.")

    def _get_latest_date(self, table_name):
        """Return the newest ``date`` value in the table as a ``datetime``."""
        result = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        return datetime.strptime(result["date"][0], "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        """List one dated file name per day from ``start_date`` to ``yesterday``."""
        # Whole days between the two points; a negative span yields no names.
        day_count = (yesterday - start_date).days + 1
        return [
            f"{(start_date + timedelta(days=offset)).strftime('%Y-%m-%d')}"
            f"-{report_name}-students.csv"
            for offset in range(day_count)
        ]

    def _read_and_concat_files(self, file_names):
        """Read every named CSV from ``data_dir`` and stack them into one frame."""
        frames = []
        for file_name in file_names:
            frame = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(frame)} records from '{file_name}'.")
            frames.append(frame)
        return pd.concat(frames)

    def _read_file(self, file_name):
        """Read a single CSV at ``file_name`` and log its record count."""
        frame = pd.read_csv(file_name)
        logging.info(f"Read {len(frame)} records from '{file_name}'.")
        return frame