def invalue_to_similarity(invalue_df, orientation_df):
    """Return the top-5 profiles most similar to the user's input.

    :param invalue_df: one-row DataFrame of converted user inputs
    :param orientation_df: DataFrame of all people of that orientation
    :return: DataFrame holding the 5 rows of the module-level ``cupid``
        DataFrame with the highest cosine similarity to the input
    """
    # Append the input row to the orientation data so both are one-hot
    # encoded with an identical column layout.
    df = pd.concat([orientation_df, invalue_df])
    df_encoded = OneHotEncoder(use_cat_names=True).fit_transform(df)

    # Last encoded row is the user's query vector (X); everything before
    # it is the candidate pool (Y). Slicing avoids the in-place mutation
    # the original performed with drop(..., inplace=True).
    cosine_input = df_encoded.iloc[[-1]]
    candidates = df_encoded.iloc[:-1]

    similarity = cosine_similarity(cosine_input, candidates)

    # Rank candidates by similarity; similarity[0] is the row of scores
    # for the single query vector (no tolist() round-trip needed).
    top5 = (pd.DataFrame(similarity[0],
                         columns=['similarity'],
                         index=candidates.index)
            .sort_values(by='similarity', ascending=False)
            .iloc[:5])

    # One vectorized lookup replaces the original per-row loop built on
    # DataFrame.append, which was removed in pandas 2.0.
    # NOTE(review): depends on the module-level ``cupid`` DataFrame, as
    # the original did — assumes its index aligns with orientation_df.
    return cupid.loc[top5.index]
def train_test_split(dataset, categorical_cols, train_fraction):
    """Split ``dataset`` into train/test partitions plus encoded copies.

    :param dataset: DataFrame to be split
    :param categorical_cols: names of the categorical columns to one-hot
        encode (previously identified automatically)
    :param train_fraction: fraction of rows assigned to the train set
    :return: tuple (train set, one-hot-encoded train set, test set,
        one-hot-encoded test set)
    """
    # Encode up front so the train and test encoded frames share exactly
    # the same column layout.
    encoded = OneHotEncoder(cols=categorical_cols,
                            use_cat_names=True).fit_transform(dataset)

    n_train = int(len(dataset.index) * train_fraction)
    train = dataset.sample(n=n_train, random_state=1)
    train_idx = train.index

    train_encoded = encoded.loc[train_idx].reset_index(drop=True)
    test = dataset.drop(train_idx).reset_index(drop=True)
    test_encoded = encoded.drop(train_idx).reset_index(drop=True)

    return train.reset_index(drop=True), train_encoded, test, test_encoded
def train_test_split(dataset, train_fraction):
    """Split ``dataset`` into train/test partitions plus encoded copies.

    Columns containing string values are treated as categorical and
    one-hot encoded, except the target column "CLASS".

    :param dataset: DataFrame to be split
    :param train_fraction: fraction of rows assigned to the train set;
        a value of 1 returns the full encoded dataset in all four slots
    :return: tuple (train set, one-hot-encoded train set, test set,
        one-hot-encoded test set)
    """
    # Auto-detect categorical columns: any column holding a string value.
    # ``iteritems`` was removed in pandas 2.0; ``items`` is the
    # supported replacement.
    categorical_cols = [
        name for name, data in dataset.items()
        if any(isinstance(value, str) for value in data.values)
    ]
    # Never encode the target column, wherever it appears. The original
    # only dropped "CLASS" when it happened to be detected last, and
    # raised IndexError on datasets with no categorical columns at all.
    if "CLASS" in categorical_cols:
        categorical_cols.remove("CLASS")

    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)

    # train_fraction == 1 means "no split": every slot gets the full
    # encoded dataset.
    if train_fraction == 1:
        return (dataset_encoded, dataset_encoded,
                dataset_encoded, dataset_encoded)

    train_len = int(len(dataset.index) * train_fraction)
    train_set = dataset.sample(n=train_len, random_state=1)
    train_set_encoded = dataset_encoded.loc[train_set.index].reset_index(
        drop=True)
    test_set = dataset.drop(train_set.index).reset_index(drop=True)
    test_set_encoded = dataset_encoded.drop(
        train_set.index).reset_index(drop=True)

    return (train_set.reset_index(drop=True), train_set_encoded,
            test_set, test_set_encoded)