Exemple #1
0
def transform_dataframe(df: pd.DataFrame) -> List[dict]:
    """Reshape a flat dataframe into nested, document-style records.

    Each row becomes one dictionary in which related columns are grouped
    under the sub-documents ``attributes`` and ``physicalAttributes``, ready
    to be loaded into a document-oriented database. The caller's dataframe
    is left untouched: all cleaning happens on the copy returned by ``drop``.

    Args:
        df: dataframe to be transformed. It must provide the columns
            ``ID``, ``Name``, ``Identity``, ``Alignment``, ``Status``,
            ``EyeColor``, ``HairColor``, ``Gender``, ``Appearances``,
            ``FirstAppearance``, ``Year`` and ``Universe``.

    Returns:
        One dictionary per input row, in row order. Every dictionary also
        carries a ``dateMigrated`` entry holding today's date as a string.
    """
    # TODO: bad characters are still unresolved (several encodings were tried
    #   without success), e.g. df.Name[39644]: tom\u00e1s Ram\u00edrez.
    #   `FirstAppearance` appears to hold corrupted records as well, which is
    #   why that column is discarded instead of migrated.
    cleaned = df.drop(columns=['FirstAppearance'])

    # A missing appearance count is stored as 0.
    cleaned.Appearances = cleaned.Appearances.fillna(0)

    # Stamp every document with the day the migration ran.
    migrated_on = dt.date.today().isoformat()

    # Columns are walked in lockstep; the order here must match the
    # unpacking order in the loop below.
    column_series = (
        cleaned.ID,
        cleaned.Name,
        cleaned.Identity,
        cleaned.Alignment,
        cleaned.Status,
        cleaned.EyeColor,
        cleaned.HairColor,
        cleaned.Gender,
        cleaned.Appearances,
        cleaned.Year,
        cleaned.Universe,
    )

    records = []
    for (record_id, name, identity, alignment, status,
         eye_color, hair_color, gender,
         appearances, year, universe) in zip(*column_series):
        # Group related fields into sub-documents.
        traits = {
            'identity': identity,
            'alignment': alignment,
            'status': status,
        }
        physique = {
            'eyeColor': eye_color,
            'hairColor': hair_color,
            'gender': gender
        }
        records.append({
            'id': record_id,
            'name': name,
            'attributes': traits,
            'physicalAttributes': physique,
            'appearancesCount': appearances,
            'year': year,
            'universe': universe,
            'dateMigrated': migrated_on
        })

    return records