def transform_dataframe(df: pd.DataFrame) -> List[dict]:
    """Arrange a flat dataframe into nested, JSON-like documents.

    Each row becomes one dictionary in which related columns are grouped
    under the sub-documents ``attributes`` and ``physicalAttributes``, ready
    to be loaded into a document-oriented database. The caller's dataframe
    is not modified.

    Args:
        df: Dataframe to be transformed. It must provide the columns ``ID``,
            ``Name``, ``Identity``, ``Alignment``, ``Status``, ``EyeColor``,
            ``HairColor``, ``Gender``, ``Appearances``, ``Year`` and
            ``Universe``. A ``FirstAppearance`` column is dropped when
            present and is not required.

    Returns:
        One dictionary per input row, in row order. Every dictionary also
        carries ``dateMigrated``, the current date as an ISO ``YYYY-MM-DD``
        string.
    """
    # TODO: Challenge: handling bad characters (tried different encodings to
    # no avail). Example: df.Name[39644]: tom\u00e1s Ram\u00edrez
    # Column `FirstAppearance` also seems to contain corrupted records, so it
    # is discarded. `errors='ignore'` keeps the function usable on input that
    # has already been cleaned (or never had the column).
    df = df.drop(columns=['FirstAppearance'], errors='ignore')

    # Replace empty/null appearance counts with 0. `assign` returns a new
    # frame, so the column is set explicitly rather than through attribute
    # assignment.
    # NOTE(review): only `Appearances` is filled; missing values in any other
    # column are passed through to the documents unchanged.
    df = df.assign(Appearances=df.Appearances.fillna(0))

    # Keep track of when the data was migrated.
    execution_date = str(dt.date.today())

    # Walk the columns in lockstep; this is much faster than `iterrows()`.
    # See https://stackoverflow.com/a/55557758
    rows = zip(
        df.ID,
        df.Name,
        df.Identity,
        df.Alignment,
        df.Status,
        df.EyeColor,
        df.HairColor,
        df.Gender,
        df.Appearances,
        df.Year,
        df.Universe,
    )

    documents = []
    for (record_id, name, identity, alignment, status, eye_color,
         hair_color, gender, appearances, year, universe) in rows:
        documents.append({
            'id': record_id,
            'name': name,
            'attributes': {
                'identity': identity,
                'alignment': alignment,
                'status': status,
            },
            'physicalAttributes': {
                'eyeColor': eye_color,
                'hairColor': hair_color,
                'gender': gender,
            },
            'appearancesCount': appearances,
            'year': year,
            'universe': universe,
            'dateMigrated': execution_date,
        })
    return documents