Exemple #1
0
def preprocess(df: DataFrame) -> DataFrame:
    """Return a copy of ``df`` prepared for clustering.

    The returned frame:

    - gains two float bookkeeping columns, ``_cluster`` and ``_distance``,
      both initialised to zero;
    - has every column named in ``IGNORED_COLUMNS`` removed (names that
      are not present are skipped);
    - has every remaining column except ``ID`` and the two bookkeeping
      columns min-max scaled to the range [0, 1].

    All work happens on the new frame produced by ``assign``; the result
    is returned rather than written back into the argument.

    Note: a feature column whose values are all equal has a zero range,
    so scaling it yields NaN (0 / 0).
    """
    n_rows = len(df)

    # Plain arrays are assigned positionally. Wrapping them in a Series
    # would align on the index and leave NaN in any frame whose index is
    # not the default 0..n-1 range.
    df = df.assign(_cluster=np.zeros(n_rows), _distance=np.zeros(n_rows))

    # Remove columns that we don't cluster on.
    # This is built for the wine dataset and the TwoDimHard.
    # It'd be nice if this was more intelligent, but alas.
    # `columns=` is used because the positional axis argument of `drop`
    # is no longer accepted by pandas 2.x.
    df = df.drop(columns=list(IGNORED_COLUMNS), errors='ignore')

    # Normalize the feature columns only. The bookkeeping columns are all
    # zero, so scaling them would turn them into NaN (0 / 0).
    untouched = ('ID', '_cluster', '_distance')
    feature_columns = [column for column in df.columns
                       if column not in untouched]

    if feature_columns:
        df[feature_columns] = df[feature_columns].apply(
            lambda x: (x - x.min()) / (x.max() - x.min()))

    return df
Exemple #2
0
def extract_catalog_query(catalog_df: DataFrame, title_col: str,
                          desc_col: str) -> str:
    """
    Build the query string used when matching a catalog against workshops.

    Currently only the titles are used: every string value in the
    ``title_col`` column is joined, in row order, with single spaces.
    Missing or otherwise non-string titles are skipped, and an empty
    catalog yields an empty string.

    ``desc_col`` is kept in the signature for callers but is not read yet.
    """
    # TODO: fold the descriptions into the query as well (title + description
    # per row, with HTML tags stripped). For now, just return a string of
    # combined titles.
    titles = catalog_df[title_col]

    # str.join raises TypeError on NaN/None, so keep real strings only.
    return " ".join(title for title in titles if isinstance(title, str))
Exemple #3
0
def build_full_dataset(base_dataset: DataFrame, segment_ids: list, labels: list) -> DataFrame:
    """Return a copy of ``base_dataset`` extended with two extra columns.

    ``segment_ids`` is stored under ``segment_id`` and ``labels`` under
    ``label``, in that order. The frame passed in is left as it was.
    """
    extra_columns = {"segment_id": segment_ids, "label": labels}
    return base_dataset.assign(**extra_columns)