def _min_max_scale(column: Series) -> Series:
    """Rescale *column* linearly onto [0, 1]; a constant column becomes 0."""
    low = column.min()
    span = column.max() - low
    shifted = column - low
    if span == 0:
        # Every value equals the minimum: dividing would give 0/0 = NaN.
        return shifted.astype(float)
    return shifted / span


def preprocess(df: DataFrame) -> DataFrame:
    """Prepare a dataset for clustering.

    Adds the ``_cluster`` and ``_distance`` tracking columns (initialised to
    0.0), drops every column listed in ``IGNORED_COLUMNS`` that is present,
    and min-max normalises the remaining feature columns.

    The ``ID`` column and the two tracking columns are left untouched by the
    normalisation step.

    Args:
        df: Input data. It is not modified; all work happens on a copy.

    Returns:
        A new DataFrame with tracking columns added, ignored columns removed
        and feature columns scaled to the range [0, 1].
    """
    # Scalars broadcast to every row regardless of the frame's index, whereas
    # a freshly built Series would be aligned on 0..n-1 and leave NaN behind
    # for any other index.
    tracking = {"_cluster": 0.0, "_distance": 0.0}
    df = df.assign(**tracking)

    # Remove columns that we don't cluster on.
    # This is built for the wine dataset and the TwoDimHard.
    for column in IGNORED_COLUMNS:
        if column in df:
            # Keyword form: the positional ``axis`` argument is gone in
            # pandas 2.0.
            df = df.drop(columns=column)

    # Normalise only real features: scaling the all-zero tracking columns
    # would turn them into NaN (0 / 0).
    features = [
        column
        for column in df.columns
        if column != 'ID' and column not in tracking
    ]
    if features:
        df[features] = df[features].apply(_min_max_scale)
    return df
def extract_catalog_query(catalog_df: DataFrame, title_col: str, desc_col: str) -> str:
    """Build the query string used when matching a catalog against workshops.

    For now the query is just the catalog titles joined with single spaces;
    descriptions are not included yet.

    Args:
        catalog_df: Catalog data, one row per entry.
        title_col: Name of the column holding the titles.
        desc_col: Name of the column holding the descriptions. Currently
            unused; kept so callers do not need to change once descriptions
            are folded into the query.

    Returns:
        All string titles joined by a single space, or an empty string when
        there are none.

    Raises:
        KeyError: If ``title_col`` is not a column of ``catalog_df``.
    """
    # Missing titles show up as NaN/None, which str.join rejects; keep only
    # genuine strings.
    titles = (
        title for title in catalog_df[title_col] if isinstance(title, str)
    )
    return " ".join(titles)
def build_full_dataset(base_dataset: DataFrame, segment_ids: list, labels: list) -> DataFrame:
    """Return a copy of *base_dataset* extended with two extra columns.

    ``segment_id`` is filled from *segment_ids* and ``label`` from *labels*,
    appended in that order. The input frame is left unchanged.
    """
    extra_columns = {"segment_id": segment_ids, "label": labels}
    return base_dataset.assign(**extra_columns)