def invalue_to_similarity(invalue_df, orientation_df, top_n=5):
    """
    Return the ``cupid`` rows most similar to the user's input.

    The input is appended to ``orientation_df`` so that both are one-hot
    encoded together and end up with identical columns. The last encoded
    row (the input) is then compared to every other encoded row with
    cosine similarity.

    Relies on names from the enclosing module: ``pd``, ``OneHotEncoder``,
    ``cosine_similarity`` and the ``cupid`` DataFrame.
    NOTE(review): ``use_cat_names`` suggests the category_encoders
    ``OneHotEncoder`` -- confirm against the file's imports.

    invalue_df: converted DataFrame of user inputs (expected to hold a
        single row; only the last row is used as the query)
    orientation_df: DataFrame of all people of that orientation
    top_n: how many best matches to return (default 5)

    Returns the rows of ``cupid`` whose index labels belong to the
    ``top_n`` most similar reference rows, ordered from most to least
    similar. The similarity scores themselves are not part of the result.
    """

    # concat input values to orientation df so both share one encoding
    combined = pd.concat([orientation_df, invalue_df])

    # ohe
    encoded = OneHotEncoder(use_cat_names=True).fit_transform(combined)

    # Split by POSITION, not by label: the input row's index label may also
    # exist in orientation_df (e.g. both start at 0), and a label-based drop
    # would silently remove that reference row as well.
    query = encoded.iloc[[-1]]      # input X, kept as a one-row DataFrame
    reference = encoded.iloc[:-1]   # input Y -- data for X to be compared to

    # cosine_similarity(X, Y) has shape (1, n_reference); take its only row
    scores = cosine_similarity(query, reference)[0]

    # rank the reference rows by similarity and keep the best top_n
    best = pd.DataFrame({'similarity': scores},
                        index=reference.index).sort_values(
                            by='similarity', ascending=False).iloc[:top_n]

    # Look the matches up in cupid in one step. DataFrame.append was removed
    # in pandas 2.0, and a single .loc lookup also keeps cupid's dtypes.
    return cupid.loc[best.index]
def train_test_split(dataset, categorical_cols, train_fraction,
                     random_state=1):
    """
    Splits the dataset into a train and a test set

    The whole dataset is one-hot encoded once, before splitting, so the
    encoded train and test sets share the same columns.

    :param dataset: data to be split
    :param categorical_cols: list of the column names of the categorical columns (previously identified automatically)
    :param train_fraction: portion of dataset to be used as train set
    :param random_state: seed passed to ``DataFrame.sample``; the default of 1 reproduces the historical split
    :return: a tuple (train set, one-hot-encoded train set, test set, one-hot-encoded test set), each with a fresh 0..n-1 index
    """
    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)

    # Split on a fresh positional index. The original index may contain
    # duplicate labels, in which case label-based .loc/.drop would duplicate
    # rows in the train set and remove extra rows from the test set. Every
    # returned frame gets its index reset anyway, so nothing is lost.
    rows = dataset.reset_index(drop=True)
    rows_encoded = dataset_encoded.reset_index(drop=True)

    train_len = int(len(rows.index) * train_fraction)
    train_set = rows.sample(n=train_len, random_state=random_state)
    picked = train_set.index

    train_set_encoded = rows_encoded.loc[picked].reset_index(drop=True)
    test_set = rows.drop(picked).reset_index(drop=True)
    test_set_encoded = rows_encoded.drop(picked).reset_index(drop=True)

    return (train_set.reset_index(drop=True), train_set_encoded,
            test_set, test_set_encoded)
# Example no. 3
# 0
def train_test_split(dataset, train_fraction, random_state=1):
    """
    Splits the dataset into a train and a test set

    Categorical columns are detected automatically: every column holding
    at least one ``str`` value is one-hot encoded. A column named "CLASS"
    (the target) is left unencoded when it is the last detected column.

    :param dataset: data to be split
    :param train_fraction: portion of dataset to be used as train set
    :param random_state: seed passed to ``DataFrame.sample``; the default of 1 reproduces the historical split
    :return: a tuple (train set, one-hot-encoded train set, test set, one-hot-encoded test set), each with a fresh 0..n-1 index.
        When ``train_fraction == 1`` no split is made: the one-hot-encoded
        full dataset (original index kept) is returned in all four positions.
    """

    # Default - set string values as categorical except target column
    # (.items() replaces .iteritems(), which was removed in pandas 2.0)
    categorical_cols = [
        column_name
        for column_name, column_data in dataset.items()
        if any(isinstance(value, str) for value in column_data.values)
    ]

    # Guard the empty case: a dataset with no string columns used to raise
    # IndexError on categorical_cols[-1].
    if categorical_cols and categorical_cols[-1] == "CLASS":
        categorical_cols.pop()

    # TODO(AR): improve categorical selection

    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)
    if train_fraction == 1:
        # NOTE(review): the "raw" train/test slots also receive the encoded
        # frame here; kept as-is because callers may rely on it.
        return dataset_encoded, dataset_encoded, dataset_encoded, dataset_encoded

    # Split on a fresh positional index. The original index may contain
    # duplicate labels, in which case label-based .loc/.drop would duplicate
    # rows in the train set and remove extra rows from the test set. Every
    # returned frame gets its index reset anyway, so nothing is lost.
    rows = dataset.reset_index(drop=True)
    rows_encoded = dataset_encoded.reset_index(drop=True)

    train_len = int(len(rows.index) * train_fraction)
    train_set = rows.sample(n=train_len, random_state=random_state)
    picked = train_set.index

    train_set_encoded = rows_encoded.loc[picked].reset_index(drop=True)
    test_set = rows.drop(picked).reset_index(drop=True)
    test_set_encoded = rows_encoded.drop(picked).reset_index(drop=True)

    return (train_set.reset_index(drop=True), train_set_encoded,
            test_set, test_set_encoded)