Esempio n. 1
0
def render_eda():
    """Streamlit page: upload a dataset and display its profiling report.

    Once a CSV/TXT file is uploaded it is read with ``pd.read_csv``, shown as
    a table next to the rendered ``ProfileReport``, and the report is also
    written to ``Output.html``.
    """
    st.title("Create a Complete Report of your data.")
    st.subheader("Exploratory Data Analysis using pandas profiling.")
    st.write("""All you need to do is upload a dataset and get a quick
            sense of your data.""")

    uploaded = st.file_uploader("Upload Dataset", type=["csv", "txt"])
    if uploaded is None:
        # Nothing to analyse until the user provides a file.
        return

    frame = pd.read_csv(uploaded)
    # Report options are described at
    # https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/index.html
    # For large datasets consider passing minimal=True.
    report = ProfileReport(frame, explorative=True)

    st.title("Pandas Profiling Report in Streamlit")
    st.write(frame)
    st_profile_report(report)

    # Persist a copy of the report next to the app.
    report.to_file("Output.html")
    st.write("Your report has been saved!")
def run_training(experiment_name,
                 data_profiling) -> None:
    """Train the price pipeline on the configured training data and save it.

    Parameters
    ----------
    experiment_name:
        Name of the run; used as the sub-folder name under
        ``config.TRAINED_MODEL_DIR`` and as the title of the data report.
        Any falsy value (``False``, ``None``, ``""``, ``0``) is replaced by a
        timestamped default name.
    data_profiling:
        When truthy, a profiling report of the training data is written to
        ``config.REPORT_DIR / "data_train_report.html"``.
    """
    _logger.info(f'Working on: {os.getcwd()}')
    # A plain falsy check also covers None and "", which `== False` missed;
    # None would otherwise blow up below when joined onto the model directory.
    if not experiment_name:
        experiment_name = datetime.now().strftime("model_experiment_%Y%m%d_%H%M%S")
        _logger.info(
            "The run training name was fixed in {}".format(experiment_name))

    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    _logger.info("The dataset contains {} rows and {} columns".format(
        data.shape[0], data.shape[1]))
    _logger.info("Dataset info: \n{}". format(
        data.describe(percentiles=[], include="all").T.to_string()))

    # NOTE(review): this path is only logged here; nothing in this function
    # creates the folder — confirm that save_pipeline takes care of it.
    model_subfloder = config.TRAINED_MODEL_DIR/experiment_name
    _logger.info("Creating model folder in {}".format(str(model_subfloder)))

    if data_profiling:
        _logger.info("Creating a data report for data training")
        profile = ProfileReport(data, title=experiment_name, explorative=True)
        profile.to_file(config.REPORT_DIR / "data_train_report.html")
        _logger.info("A report in html was saved in {}".format(config.REPORT_DIR))

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)  # we are setting the seed here

    # transform the target: the pipeline is fitted on the log of the target
    y_train = np.log(y_train)
    y_test = np.log(y_test)

    pipeline.price_pipe.fit(X_train[config.FEATURES],
                            y_train)

    _logger.info(f'saving model version: {_version}')
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)

    _logger.info(f'Logs saved on: {config.LOG_DIR}')
Esempio n. 3
0
def test_modular_present(tdf):
    """Every optional report section is rendered when its module is enabled."""
    correlation_names = ("pearson", "spearman", "kendall", "phi_k", "cramers")
    diagram_names = ("matrix", "bar", "dendrogram", "heatmap")

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 10},
        samples={"head": 10, "tail": 10},
        interactions={"targets": ["mass (g)"], "continuous": True},
        correlations={name: {"calculate": True} for name in correlation_names},
        missing_diagrams={name: True for name in diagram_names},
        pool_size=1,
    )

    html = profile.to_html()
    expected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in expected_headings:
        assert heading in html
Esempio n. 4
0
async def inspect_data(request: web.Request):
    """Profile one CSV file of a project and respond with the HTML report.

    The file is read from ``<PROJECT_DIR>/<project>/files/<file>``, where
    ``project`` and ``file`` come from the route's match info. The report is
    written next to the CSV with an ``.html`` extension and its contents are
    returned as a ``text/html`` response.

    Raises
    ------
    web.HTTPBadRequest
        If ``project`` or ``file`` is empty, is ``.``/``..`` or contains a
        path separator.
    """
    from pathlib import Path  # local import keeps this handler self-contained

    filename = request.match_info['file']
    project = request.match_info['project']

    # Both values come from the request URL: refuse anything that could step
    # outside the project's "files" directory.
    for part in (project, filename):
        if part in ("", ".", "..") or "/" in part or "\\" in part:
            raise web.HTTPBadRequest(text="Invalid project or file name")

    files_dir = Path(request.app['settings'].PROJECT_DIR) / project / "files"
    csv_path = files_dir / filename

    # NOTE(review): reading and profiling are blocking calls made directly
    # inside an async handler — consider moving them to an executor.
    df = pd.read_csv(csv_path)

    profile = ProfileReport(df,
                            title='Pandas Profiling Report',
                            html={'style': {
                                'full_width': True
                            }})

    # Swap only the extension. A name without ".csv" gets ".html" appended, so
    # the report can never be written over the source data file.
    if csv_path.suffix == ".csv":
        report_path = csv_path.with_suffix(".html")
    else:
        report_path = csv_path.with_name(csv_path.name + ".html")

    profile.to_file(output_file=report_path)
    text = report_path.read_text(encoding='utf-8')
    print("responding now")
    return web.Response(text=text, content_type='text/html')
Esempio n. 5
0
def test_example(get_data_file, test_output_dir):
    """Profile the meteorites example in minimal mode and check the output.

    The data is augmented with a constant, a boolean, a mixed-type and a
    highly correlated column plus ten near-duplicate rows before profiling.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Dates that pandas cannot represent are coerced to NaT and thereby
    # ignored for this analysis
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    # An explicit copy so that renaming never writes through to `df`.
    duplicates_to_add = df.iloc[0:10].copy()
    duplicates_to_add["name"] += " copy"

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df, title="NASA Meteorites", samples={"head": 5, "tail": 5}, minimal=True
    )
    profile.to_file(output_file=output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert (
        isinstance(description, dict)
        and len(description.items()) == 7
    ), "Unexpected result"
    assert "<span class=badge>10</span>" in profile.to_html()
Esempio n. 6
0
    def report(
        self,
        dataset: str = "dataset",
        n_rows: Optional[Union[int, float]] = None,  # float for 1e3...
        filename: Optional[str] = None,
    ):
        """Build a profile analysis report for one of the instance's data sets.

        The report is rendered in HTML5 and CSS3. Be aware that profiling
        can be slow for n_rows>10k.

        Parameters
        ----------
        dataset: str, optional (default="dataset")
            Name of the attribute holding the data set to profile.

        n_rows: int or None, optional (default=None)
            Number of (randomly picked) rows to process. None to use
            all rows.

        filename: str or None, optional (default=None)
            Name of the file to save the report to; ".html" is appended
            when missing. None to not save anything.

        Returns
        -------
        profile: ProfileReport
            Created report object.

        """
        self.log("Creating profile report...", 1)

        data = getattr(self, dataset)
        if n_rows is None:
            n_rows = data.shape[0]
        else:
            n_rows = int(n_rows)
        profile = ProfileReport(data.sample(n_rows))

        if filename:
            target = filename if filename.endswith(".html") else filename + ".html"
            profile.to_file(target)
            self.log("Report saved successfully!", 1)

        return profile
Esempio n. 7
0
def test_modular_absent(tdf):
    """No optional report section is rendered when every module is disabled."""
    disabled_modules = dict(
        duplicates={"head": 0},
        samples={"head": 0, "tail": 0},
        interactions=None,
        correlations=None,
        missing_diagrams=None,
    )
    profile = ProfileReport(tdf, title="Modular test", **disabled_modules)

    html = profile.to_html()
    unexpected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in unexpected_headings:
        assert heading not in html
def test_html_export_png(test_output_dir):
    """A non-inline export with PNG plots writes the report and its assets."""
    shape = (10, 10)  # (rows, columns)
    column_names = [f"column_{c}" for c in range(shape[1])]
    df = pd.DataFrame(
        np.random.randint(0, 1000, size=shape),
        columns=column_names,
    )

    export_options = {
        "html": {"inline": False},
        "plot": {"image_format": "png"},
    }
    profile = ProfileReport(df, minimal=True, **export_options)

    report = test_output_dir / "export_png.html"
    profile.to_file(report)
    assert report.exists()

    check_assets(test_output_dir / "export_png_assets", "png", n_css=3, n_js=3)
Esempio n. 9
0
def test_issue_120(get_data_file):
    """The issue-120 dataset renders to HTML with the statistics header."""
    file_name = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    df = pd.read_csv(file_name)

    report = ProfileReport(
        df,
        correlations={"cramers": {
            "calculate": False
        }},
        vars={"cat": {
            "check_composition": True
        }},
    )
    html = report.to_html()
    # isinstance is the idiomatic type check and also accepts str subclasses.
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
Esempio n. 10
0
def test_issue_120(get_data_file):
    """The issue-120 dataset yields a description set without correlations."""
    source_url = "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt"
    df = pd.read_csv(get_data_file("pandas_profiling_bug.txt", source_url))

    categorical_options = {"words": True, "characters": True}
    report = ProfileReport(
        df,
        correlations=None,
        progress_bar=False,
        pool_size=1,
        vars={"cat": categorical_options},
    )

    # Accessing the property forces the report to be built.
    _ = report.report
    assert report.description_set is not None
def test_subdir(test_output_dir):
    """Exporting into a sub-directory leaves files already there untouched."""
    shape = (10, 10)  # (rows, columns)
    df = pd.DataFrame(
        np.random.randint(0, 1000, size=shape),
        columns=[f"column_{c}" for c in range(shape[1])],
    )

    profile = ProfileReport(df, minimal=True, html={"inline": False})

    subdir_path = test_output_dir / "subdir"
    subdir_path.mkdir()
    bystander = subdir_path.joinpath("test.py")
    bystander.touch()

    report = subdir_path / "subdir.html"
    profile.to_file(report)
    assert report.exists()

    check_assets(subdir_path / "subdir_assets", "svg", n_css=3, n_js=3)
    # The pre-existing file must survive the export.
    assert subdir_path.joinpath("test.py").exists()
Esempio n. 12
0
def main(dir_main, make_profile=False):
    """Load the true/fake news corpus, optionally profile it, vectorize titles.

    :param dir_main: base directory containing the corpus folder
    :param make_profile: when true, write a profiling report of the loaded
        data to ``dir_main / PROFILE_REPORT_HTML``
    """
    dir_main = Path(dir_main)
    path_true_news = dir_main / CORPUS_DIR / TRUE_CSV
    path_fake_news = dir_main / CORPUS_DIR / FAKE_CSV
    path_profile = dir_main / PROFILE_REPORT_HTML

    # load and format data
    df_all = import_data(path_true_news, path_fake_news)

    if make_profile:
        # Imported lazily because the import itself is slow.
        from pandas_profiling import ProfileReport
        ProfileReport(df_all).to_file(output_file=path_profile)

    # vectorize title only
    df_all = vectorize_content(df_all, label_col='label', text_col='title')

    df_all.describe()
Esempio n. 13
0
def test_interactions_target():
    """Scatter interactions are computed for every (target, column) pair."""
    n_rows, n_columns, n_targets = 10, 50, 2

    column_names = [f"column_{c}" for c in range(n_columns)]
    df = pd.DataFrame(
        np.random.randint(0, 1000, size=(n_rows, n_columns)),
        columns=column_names,
    )
    targets = [f"column_{target}" for target in range(0, n_targets)]

    interaction_options = {"continuous": True, "targets": targets}
    profile = ProfileReport(df, minimal=True, interactions=interaction_options)

    scatter = profile.get_description()["scatter"]
    total = sum(len(per_target.keys()) for per_target in scatter.values())
    assert total == n_targets * n_columns
Esempio n. 14
0
def test_issue523():
    """A nullable Int64 column containing pd.NA can be described."""
    # https://github.com/pandas-dev/pandas/issues/33803
    missing = pd.NA
    values = [
        1871248, 12522551, 1489260, 6657093,
        missing, missing, missing, missing, missing,
        1489260,
        missing,
        2468576,
    ]
    df = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())

    profile_report = ProfileReport(df, title="Test Report", progress_bar=False)
    description = profile_report.get_description()
    assert len(description) > 0
def profiling_report(df, minimal_mode=False, dark_mode=True):
    """
    Use the pandas_profiling library for a quick visual exploration of the
    data and save it as an HTML report.

    Parameters
    ----------
    df : dataframe
        dataframe with data to analyse.
    minimal_mode : bool, optional
        Passed to ``ProfileReport`` as ``minimal``. The default is False.
    dark_mode : bool, optional
        Whether the report is rendered with the dark theme.
        The default is True.

    Returns
    -------
    None. The report is written to
    ``results/exploratory-analysis/{mode}-eda.html``, where ``mode`` is
    ``no-expensive-computations`` in minimal mode and empty otherwise.
    """

    # Only the minimal flag influences the output file name.
    mode = "no-expensive-computations" if minimal_mode else ""

    title = "Exploratory Data Analysis: Floating Data"
    # The flag used to be forwarded as `orange_mode`, which selected the
    # orange theme instead of the dark one this parameter documents.
    prof = ProfileReport(df,
                         title=title,
                         explorative=False,
                         minimal=minimal_mode,
                         dark_mode=dark_mode)
    # save the html
    path_output =\
        f'results/exploratory-analysis/{mode}-eda.html'
    prof.to_file(output_file=path_output)
Esempio n. 16
0
def test_modular_description_set(tdf):
    """A description is still produced with all optional modules off."""
    diagram_names = ("matrix", "bar", "dendrogram", "heatmap")

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates=None,
        samples={"head": 0, "tail": 0},
        correlations=None,
        interactions=None,
        missing_diagrams={name: False for name in diagram_names},
        pool_size=1,
    )

    description = profile.get_description()
    assert len(description) > 0
Esempio n. 17
0
def test_issue_120(get_data_file):
    """The issue-120 dataset yields a description set with every correlation off."""
    source_url = "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt"
    df = pd.read_csv(get_data_file("pandas_profiling_bug.txt", source_url))

    correlation_names = (
        "cramers", "phi_k", "kendall", "spearman", "pearson", "recoded",
    )
    report = ProfileReport(
        df,
        correlations={name: {"calculate": False} for name in correlation_names},
        progress_bar=False,
        pool_size=0,
        vars={"cat": {"words": True, "characters": True}},
    )

    # Accessing the property forces the report to be built.
    _ = report.report
    assert report.description_set is not None
def test_html_export_cdn(test_output_dir):
    """A non-inline export without local assets writes one CSS and one JS file."""
    shape = (10, 10)  # (rows, columns)
    df = pd.DataFrame(
        np.random.randint(0, 1000, size=shape),
        columns=[f"column_{c}" for c in range(shape[1])],
    )

    html_options = {"inline": False, "use_local_assets": False}
    profile = ProfileReport(df, minimal=True, html=html_options)

    report = test_output_dir / "cdn.html"
    profile.to_file(report)
    assert report.exists()

    check_assets(test_output_dir / "cdn_assets", "svg", n_css=1, n_js=1)
Esempio n. 19
0
def print_basic_details(df, file_name, report=False, open_html=False):
    """Print a short summary of ``df`` and optionally write a profiling report.

    The summary comes from ``get_basic_details(df)``. When ``report`` is true
    a report is saved as ``<file_name>.html``; when ``open_html`` is also true
    that file is opened with ``webbrowser``.
    """
    from pandas_profiling import ProfileReport
    details = get_basic_details(df)
    raw_num, col_num, col_list_names, duplicated_raws, col_types = details

    summary_template = '''
The number of raws in {0} DF are: {1}
The number of columns in {0} DF are: {2}
The names of columns in {0} DF are: \n {3}
The count of duplicated raws in {0} DF is: {4}
Columns types in {0} DF are: \n {5}
						'''
    print(summary_template.format(file_name, raw_num, col_num, col_list_names,
                                  duplicated_raws, col_types))

    if not report:
        return

    html_name = "{}.html".format(file_name)
    profile = ProfileReport(
        df,
        title='Pandas Profiling Report - {} Data Frame'.format(file_name),
        html={'style': {'full_width': True}})
    profile.to_file(output_file=html_name)

    if open_html:
        import webbrowser
        webbrowser.open(html_name)
Esempio n. 20
0
def get_profile_results(data):
    """Return the HTML profiling report of a pandas dataframe.

    Raises ``TypeError`` when ``data`` is not a ``pd.DataFrame``.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError('This is not a pandas dataframe.')

    # Only the Pearson correlation is computed.
    correlation_settings = {
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    }
    report = ProfileReport(
        data,
        title='Snowflake Data Profiler from Hashmap',
        progress_bar=False,
        explorative=True,
        correlations=correlation_settings,
    )

    # Author's note: this step sometimes fails with matplotlib errors about
    # threads. So far it has only been fixed by pinning specific library
    # versions in requirements.txt; pyarrow seems to have an impact on this.
    return report.to_html()
Esempio n. 21
0
    def profile(self,
                title: str = 'Dataset profile report',
                html_path: str = None,
                show_report_in_notebook: bool = False):
        """Generates a pandas-profiling report of the dataset to be displayed in a jupyter notebook.
        Optionally saves the report as an html file. If a file already exists at ``html_path``
        it is displayed instead of generating a new report.

        :param title: Title passed to the generated report
        :param html_path: If provided, the pandas-profiling report will be saved to disk
        :param show_report_in_notebook: Whether or not to show report in jupyter notebook.
            NOTE(review): this flag is currently not consulted; the report is always displayed.
        :return: None
        """
        # html_path defaults to None, and os.path.exists(None) raises a
        # TypeError, so only look on disk when a path was actually given.
        if html_path is not None and os.path.exists(html_path):
            logger.info(
                f'A profiling report was already generated and will be loaded from ``{html_path}``'
            )
            display(IFrame(src=html_path, width=10**3, height=10**3))
            return

        logger.info('Generating the profiling report')
        profile_report = ProfileReport(self.data, title=title)
        if html_path is not None:
            profile_report.to_file(html_path)
            logger.info(
                f'Saved the pandas-profiling report to ``{html_path}``')
        profile_report.to_notebook_iframe()
Esempio n. 22
0
def test_issue864():
    """The extreme-values tab titles follow the configured n_extreme_obs."""
    values = [random.randrange(0, 100) for _ in range(0, 30)]
    df = pd.DataFrame({"a": values})

    profile = ProfileReport(df)

    def check_tab_titles(n_extreme_obs):
        """Generate HTML and validate the tabs contain the proper tab titles."""
        profile.config.n_extreme_obs = n_extreme_obs
        profile.invalidate_cache()

        patterns = [
            f"*<a href=* aria-controls=* role=tab data-toggle=tab>Minimum {n_extreme_obs} values</a>*",
            f"*<a href=* aria-controls=* role=tab data-toggle=tab>Maximum {n_extreme_obs} values</a>*",
        ]

        html = profile.to_html()

        for pattern in patterns:
            assert fnmatch.fnmatch(html, pattern)

    for n_extreme_obs in (5, 10, 12):
        check_tab_titles(n_extreme_obs)
Esempio n. 23
0
def profiling_page():
    """Streamlit page that profiles the result of a user-supplied SQL query."""
    logger.info({"message": "Loading profiling page."})
    st.title("Profiling Tables")

    # Select table
    db = Database(file_name=st.session_state.db_name)
    db_tables = db.show_tables()

    if len(db_tables) == 0:
        no_tables_message = "The database has no tables available."
        st.warning(no_tables_message)
        logger.warning({"message": no_tables_message})
        st.stop()

    st.write(
        "You can select an entire table or create your custom SQL-statement.")

    # Stays None until the form has been submitted.
    df_query = None
    with st.form(key="profiling_form"):
        query = st.text_area("SQL-statement",
                             value="SELECT * FROM table",
                             height=300,
                             help="SQL-statement based on SQLite syntax.")

        st.write(' ')

        submitted = st.form_submit_button(label='Profiling')
        if submitted:
            logger.info({"message": "Profiling Table."})
            df_query = db.query(query)

    if df_query is not None:
        report = ProfileReport(df_query, explorative=True, dark_mode=True)
        st_profile_report(report)

    logger.info({"message": "Profiling page loaded."})
Esempio n. 24
0
""" Read the data and prepare it for model building
Read the data from the given .txt* files; values that were not recorded are filled in as NaN
Select the important parameters; possible ones are ("Gear","amb","HCnt"...)
Create an analysis report with pandas_profiling to get an overview of the data
Visualise the data, including the time course and a linear regression per time interval (90 days here)
"""
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ml_dataprocessing_txttocsv import ml_data_processing_txt2cav
from visualisation import interval_reg, normal_draw
from pandas_profiling import ProfileReport
# whether the data has to be converted (1: yes, 0: no)
data_convert = 1
# choose the location where the data is stored
path_data = 'E:\\0000_Daten\\0000_Daten'
# save the plots directly without displaying them
matplotlib.use('agg')
# convert the data to csv files
if data_convert == 1:
    ml_data_processing_txt2cav(path_data)
# start: process each work-part folder in turn
folder = ['Workpart1_WTG', 'Workpart2_WTG']
for folder_name in folder:
    file_path = 'E:\\csv_data_group_8\\' + folder_name
    print('working on ' + folder_name)
    # names of all entries found in this folder
    file_name = os.listdir(file_path)
Esempio n. 25
0
# Flag each row as 1 when its quality is at least 6.5 and as 0 otherwise.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for _, value in df.quality.items():
    quality_bool.append(1 if value >= 6.5 else 0)

df['quality_bool'] = quality_bool  # Put the new column in the data frame
df.quality_bool.value_counts()  # See how the two classes are distributed

# Import ProfileReport to help with the descriptive analysis
from pandas_profiling import ProfileReport

profile = ProfileReport(df,
                        title='Relatory of Red Wine Quality',
                        html={'style': {
                            'full_width': True
                        }})
profile
profile.to_file(output_file="redwine_quality.html")  # Save the report to disk

# Cleaning Data
df.duplicated()  # Find duplicated rows
# Create another data frame without the duplicated rows
df_without_duplicates = df.drop_duplicates()

# Comparing Data Frames
df.info()
df_without_duplicates.info()
df_without_duplicates.mean()
df_without_duplicates = df_without_duplicates.rename(
# Add the column t and its square (t_square) to the data
air['t'] = t
air['t_square'] = air['t'] * air['t']
# Take the natural log of the Passengers column and add it to the main data as log_pass
log_pass = np.log(air['Passengers'])
air['log_pass'] = log_pass

# In[7]:

air

# In[8]:

# EDA: build the profiling report for the airlines data
report = ProfileReport(air,
                       title="Profile Report of the Airlines data",
                       explorative=True)

# In[29]:

report.to_widgets()

# In[ ]:

# The report says that:
# 1) there are no missing values
# 2) there are no duplicate rows
# 3) there is no multicollinearity

# In[9]:
Esempio n. 27
0
from statsmodels.nonparametric.smoothers_lowess import lowess
import phik

from helpers import encode_dates, loguniform, similarity_encode

# Load the games data; the two release-date columns are parsed as dates.
df = pd.read_csv(
    r"data\appstore_games.csv",
    parse_dates=["Original Release Date", "Current Version Release Date"],
    index_col=[],
    delimiter=",",
    low_memory=False,
)

# Switch to True to write a profiling report of the loaded data.
PROFILE = False
if PROFILE:
    profile = ProfileReport(df)
    profile.to_file("pandas_profiling_report.html")

# Per-column overview: dtype and share of distinct values, sorted by both.
column_overview = pd.concat([df.dtypes, df.nunique() / len(df)], axis=1)
column_overview = column_overview.rename(
    {0: "dtype", 1: "proportion unique"}, axis=1)
print(column_overview.sort_values(["dtype", "proportion unique"]))

TARGET = "Average User Rating"
missing_target_count = df[TARGET].isnull().sum()
print(f"Missing targets: {missing_target_count}")
print(f"% missing: {missing_target_count / len(df):.0%}")

DROP_MISSING = False
if DROP_MISSING:
    # Remove the 'Bes2' column from the selected data in place.
    # NOTE(review): 'Bes2' is still listed in nominal_columns below —
    # confirm which of the two is intended.
    baysis_selected.drop('Bes2', axis='columns', inplace=True)

    ###############
    ### Scatter ###
    ###############

    ###########
    ### Box ###
    ###########

    ##############
    ### Report ###
    ##############

    # Optionally write a profiling report of the selected data to
    # <work_path><file_prefix>_report.html
    if generate_report:
        report = ProfileReport(baysis_selected,
                               title='BAYSIS Selected Dataset Report')
        report.to_file(work_path + file_prefix + '_report.html')

    ###################
    ### Encoding ###
    ###################

    # define column types: columns are grouped by measurement scale
    # (nominal, dichotomous, ordinal) — presumably so that each group can be
    # encoded differently further down; TODO confirm against the rest of the
    # function
    nominal_columns = [
        "Str", "Kat", "Typ", "UArt1", "UArt2", "AUrs1", "AUrs2", "AufHi",
        "Char1", "Char2", "Bes1", "Bes2", "Lich1", "Lich2", "Zust1", "Zust2",
        "WoTag", 'Month'
    ]
    dichotomous_columns = ["Alkoh"]
    ordinal_columns = ["Betei", "Fstf", "FeiTag"]
Esempio n. 29
0
    # Sidebar widget that accepts a CSV upload
    uploaded_file = st.sidebar.file_uploader("Upload your input CSV file",
                                             type=["csv"])
    # st.sidebar.markdown("""
    # [Example CSV input file](https://raw.githubusercontent.com/NewHarmony/100_Days_of_Code/data/master/Data_Science/Streamlit/percent_bachelors_degrees_women_usa.csv)
    # """)

# Pandas Profiling Report
if uploaded_file is not None:

    # Read the uploaded CSV; wrapped in st.cache
    @st.cache
    def load_csv():
        csv = pd.read_csv(uploaded_file)
        return csv

    df = load_csv()
    pr = ProfileReport(df, explorative=True)
    # Show the raw data first, then the rendered profiling report
    st.header('**Input DataFrame**')
    st.write(df)
    st.write('--')
    st.header('**Pandas Profiling Report**')
    st_profile_report(pr)
else:
    # No upload yet: offer the bundled example dataset instead
    st.info('Waiting for CSV file to be uploaded.')
    if st.button('Click to use Example Dataset'):
        # Example data, read from a CSV file in the working directory
        @st.cache
        def load_data():
            csv = pd.read_csv("percent_bachelors_degrees_women_usa.csv")
            return csv

        df = load_data()
Esempio n. 30
0
def get_data(autophrase_params, data_in, false_positive_phrases,
             false_positive_substrings):
    """Build the movie dataset, run AutoPhrase on the summaries and export it.

    Reads ``movie.metadata.tsv`` and ``plot_summaries.txt`` from ``data_in``,
    merges them on ``id``, writes the summaries to ``data/temp/summaries.txt``,
    runs the AutoPhrase shell scripts, attaches the highlighted phrases from
    ``model/autophrase/segmentation.txt`` as a ``phrases`` column, and writes
    ``data/out/data.pkl`` plus a profiling report ``data/out/report.html``.

    Parameters
    ----------
    autophrase_params : dict
        Rendered as ``KEY=value`` assignments in front of the AutoPhrase
        shell scripts.
    data_in : str
        Directory containing the raw input files.
    false_positive_phrases : container of str
        Lower-cased phrases to discard (exact match).
    false_positive_substrings : iterable of str
        A phrase containing any of these substrings is discarded.
    """
    # Make data directories
    os.makedirs('data/temp', exist_ok=True)
    os.makedirs('data/out', exist_ok=True)

    # NOTE(review): the converters below call eval() on raw cell text from the
    # metadata file. That executes arbitrary code if the file is not trusted;
    # consider ast.literal_eval or json.loads once the cell format is confirmed.

    # Read in raw data
    def normalize_languages(x):
        def is_utf8(value):
            try:
                value.encode()
            except UnicodeEncodeError:
                return False
            return True

        def sub(value):
            return re.sub(r' [Ll]anguages?', '', value)

        return list(
            np.unique(
                [sub(value) for value in eval(x).values() if is_utf8(value)]))

    def normalize_countries(x):
        return sorted(eval(x).values())

    # Genre names that are replaced with a more common genre name
    genre_aliases = {
        'Animal Picture': 'Animals',
        'Biographical film': 'Biography',
        'Biopic [feature]': 'Biography',
        'Buddy Picture': 'Buddy',
        'Comdedy': 'Comedy',
        'Coming of age': 'Coming-of-age',
        'Detective fiction': 'Detective',
        'Education': 'Educational',
        'Gay Interest': 'Gay',
        'Gay Themed': 'Gay',
        'Gross out': 'Gross-out',
        'Pornography': 'Pornographic',
        'Social issues': 'Social problem',
    }

    def normalize_genres(x):
        def sub(value):
            if value in genre_aliases:
                return genre_aliases[value]
            # Otherwise just drop a trailing " film(s)" / " movie(s)"
            return re.sub(' [Ff]ilms?| [Mm]ovies?', '', value)

        return list(np.unique([sub(value) for value in eval(x).values()]))

    def clean_summary(summary):
        # regex=True is spelled out because pandas 2.0 switched the default of
        # Series.str.replace to literal matching, which would silently turn
        # these three substitutions into no-ops.
        return (summary
                .str.replace(r'{{.*?}}', '', regex=True)  # Remove Wikipedia tags
                .str.replace(r'http\S+', '', regex=True)  # Remove URLs
                .str.replace(r'\s+', ' ', regex=True)  # Combine whitespace
                .str.strip()  # Strip whitespace
                .replace('', pd.NA)  # Replace empty strings with NA
                )

    movies = pd.read_csv(
        f'{data_in}/movie.metadata.tsv',
        converters={
            'languages': normalize_languages,
            'countries': normalize_countries,
            'genres': normalize_genres
        },
        delimiter='\t',
        header=None,
        index_col='id',
        names='id name date revenue runtime languages countries genres'.split(
        ),
        usecols=[
            0, 2, 3, 4, 5, 6, 7, 8
        ]).assign(date=lambda x: pd.to_datetime(x.date, errors='coerce'))

    summaries = pd.read_csv(
        f'{data_in}/plot_summaries.txt',
        delimiter='\t',
        header=None,
        index_col='id',
        names='id summary'.split()).assign(
            summary=lambda x: clean_summary(x.summary)).dropna()

    # Combine movie metadata and plot summaries into df
    df = movies.merge(summaries,
                      on='id').sort_values('date').reset_index(drop=True)

    # Run AutoPhrase on plot summaries
    with open('data/temp/summaries.txt', 'w') as f:
        f.write('\n'.join(df.summary))

    autophrase_params = ' '.join(
        [f'{param}={value}' for param, value in autophrase_params.items()])
    # NOTE(review): the exit status of the shell pipeline is not checked, so a
    # failed AutoPhrase run only surfaces when segmentation.txt is read below.
    os.system(
        f'cd AutoPhrase && {autophrase_params} ./auto_phrase.sh && {autophrase_params} ./phrasal_segmentation.sh'
    )

    # Add phrases to df
    def extract_highlighted_phrases(segmentation):
        def is_false_positive(s):
            s = s.lower()
            if len(s) == 1:  # Only 1 character
                return True
            if s in false_positive_phrases:
                return True
            return any(false_positive_substring in s
                       for false_positive_substring in false_positive_substrings)

        return (segmentation.str.findall(r'<phrase>(.+?)</phrase>').apply(
            lambda x: [s.lower() for s in x
                       if not is_false_positive(s)]).apply(
                           np.unique).apply(list).values)

    # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
    # single-column frame afterwards gives the same Series.
    segmentation = pd.read_csv('model/autophrase/segmentation.txt',
                               delimiter=r'\n',
                               engine='python',
                               header=None).squeeze('columns')
    df['phrases'] = extract_highlighted_phrases(segmentation)

    # Export df
    df.to_pickle('data/out/data.pkl')
    ProfileReport(df).to_file('data/out/report.html')