Beispiel #1
0
def get_split_binary_data():
    """
    Builds class-balanced binary training and test sets from "dataset.csv".

    The file is loaded with load_data and every row labelled 0 is dropped,
    leaving two label values:
        -1: poor/average
         1: good
    The training set takes up to the first 500 rows of each class; the test
    set takes the rows at positions 500-699 of each class. In both sets the
    positive rows come first. The dictionary is extracted from the training
    rows only and is used to build both feature matrices.

    Returns:
        (X_train, Y_train, X_test, Y_test, dictionary)
    """
    reviews = load_data("dataset.csv")
    reviews = reviews[reviews['label'] != 0]
    good = reviews[reviews['label'] == 1].copy()
    bad = reviews[reviews['label'] == -1].copy()

    train_rows = pd.concat([good[:500], bad[:500]], ignore_index=True)
    dictionary = project1.extract_dictionary(train_rows)
    test_rows = pd.concat([good[500:700], bad[500:700]], ignore_index=True)

    # Copy the labels out before the frames are handed to the featurizer.
    train_labels = train_rows['label'].values.copy()
    test_labels = test_rows['label'].values.copy()

    train_features = project1.generate_feature_matrix(train_rows, dictionary)
    test_features = project1.generate_feature_matrix(test_rows, dictionary)

    return (train_features, train_labels, test_features, test_labels,
            dictionary)
Beispiel #2
0
def get_imbalanced_data(dictionary, positive_class_size=800, ratio=0.25):
    """
    Builds an imbalanced binary training set from "data/imbalanced.csv".

    Rows labelled 0 are dropped, so the remaining labels are
        -1: poor/average
        1: good
    Up to positive_class_size positive rows are kept, followed by up to
    int(positive_class_size * ratio) negative rows.

    Input:
        dictionary: the dictionary created via get_split_binary_data
        positive_class_size: the size of the positive data
        ratio: ratio of negative_class_size to positive_class_size
    Returns:
        (X_train, Y_train)
    """
    negative_class_size = int(positive_class_size * ratio)

    reviews = load_data("data/imbalanced.csv")
    reviews = reviews[reviews['label'] != 0]
    good = reviews[reviews['label'] == 1].copy()
    bad = reviews[reviews['label'] == -1].copy()

    selected = pd.concat(
        [good[:positive_class_size], bad[:negative_class_size]],
        ignore_index=True,
    )
    features = project1.generate_feature_matrix(selected, dictionary)
    labels = selected['label'].values.copy()

    return (features, labels)
Beispiel #3
0
def get_multiclass_training_data(class_size=400):
    """
    Builds a three-class training set from "data/dataset.csv".

    The labels are multiclass:
        -1: poor
         0: average
         1: good
    Up to class_size rows are taken from each class and stacked in the order
    positive, negative, neutral. The dictionary is extracted from exactly
    these rows.

    Input:
        class_size: Size of each class (pos/neg/neu) of training dataset.
    Returns:
        (X_train, Y_train, dictionary)
    """
    reviews = load_data("data/dataset.csv")
    neutral = reviews[reviews['label'] == 0].copy()
    good = reviews[reviews['label'] == 1].copy()
    bad = reviews[reviews['label'] == -1].copy()

    # Stacking order (positive, negative, neutral) fixes the row order of
    # both the feature matrix and the label vector.
    per_class = [part[:class_size] for part in (good, bad, neutral)]
    train_rows = pd.concat(per_class, ignore_index=True)

    dictionary = project1.extract_dictionary(train_rows)
    labels = train_rows['label'].values.copy()
    features = project1.generate_feature_matrix(train_rows, dictionary)

    return (features, labels, dictionary)
Beispiel #4
0
def get_heldout_reviews(dictionary):
    """
    Loads "heldout.csv" with load_data and converts it to a feature matrix
    via project1.generate_feature_matrix.

    Input:
        dictionary: the dictionary created by get_multiclass_training_data
    Returns:
        The feature matrix for the held-out reviews.
    """
    heldout_reviews = load_data("heldout.csv")
    return project1.generate_feature_matrix(heldout_reviews, dictionary)
Beispiel #5
0
def get_multiclass_training_data():
    """
    Builds a multiclass training set from every row of "dataset.csv".

    The labels are multiclass:
        -1: poor
         0: average
         1: good
    The dictionary is extracted from the full loaded frame and then used to
    build the feature matrix for that same frame.

    Returns:
        (X_train, Y_train, dictionary)
    """
    reviews = load_data("dataset.csv")

    dictionary = project1.extract_dictionary(reviews)
    features = project1.generate_feature_matrix(reviews, dictionary)
    labels = reviews['label'].values.copy()

    return (features, labels, dictionary)
Beispiel #6
0
def get_imbalanced_test(dictionary, positive_class_size=400, ratio=0.25):
    """
    Loads "data/dataset.csv" via load_data and returns a subset of it
    reflecting an imbalanced binary test dataset.

    Rows whose 'rating' is 0 are dropped, leaving
        -1: poor/average
        1: good
    Up to positive_class_size positive rows are kept, followed by up to
    int(positive_class_size * ratio) negative rows. The defaults reproduce
    the previous fixed 400 positive / 100 negative split.

    Input:
        dictionary: the dictionary created via get_split_binary_data
        positive_class_size: the size of the positive data
        ratio: ratio of negative_class_size to positive_class_size
    Returns:
        (X_test, Y_test): the feature matrix built by
        project1.generate_feature_matrix and the matching 'rating' values.
    """
    fname = "data/dataset.csv"
    dataframe = load_data(fname)
    dataframe = dataframe[dataframe['rating'] != 0]
    positiveDF = dataframe[dataframe['rating'] == 1].copy()
    negativeDF = dataframe[dataframe['rating'] == -1].copy()
    X_test = pd.concat([
        positiveDF[:positive_class_size],
        negativeDF[:int(positive_class_size * ratio)]
    ]).reset_index(drop=True).copy()
    # The labels must come from the column that defined the classes above.
    # Reading 'label' here either raised KeyError or returned values from a
    # column that was never used to split the data.
    Y_test = X_test['rating'].values.copy()
    X_test = project1.generate_feature_matrix(X_test, dictionary)

    return (X_test, Y_test)
Beispiel #7
0
def get_imbalanced_data(dictionary):
    """
    Builds an imbalanced binary training set from "data/imbalanced.csv".

    Rows whose 'rating' is 0 are dropped, so the remaining labels are
        -1: poor/average
        1: good
    Up to the first 800 positive rows are kept, followed by up to the first
    200 negative rows.

    Input:
        dictionary: the dictionary created via get_split_binary_data
    Returns:
        (X_train, Y_train)
    """
    reviews = load_data("data/imbalanced.csv")
    reviews = reviews[reviews['rating'] != 0]
    good = reviews[reviews['rating'] == 1].copy()
    bad = reviews[reviews['rating'] == -1].copy()

    selected = pd.concat([good[:800], bad[:200]], ignore_index=True)
    features = project1.generate_feature_matrix(selected, dictionary)
    labels = selected['rating'].values.copy()

    return (features, labels)
Beispiel #8
0
def get_imbalanced_test(dictionary, positive_class_size=200, ratio=0.25):
    """
    Builds an imbalanced binary test set from "data/dataset.csv".

    Rows labelled 0 are dropped, so the remaining labels are
        -1: poor/average
        1: good
    Up to positive_class_size positive rows are kept, followed by up to
    int(positive_class_size * ratio) negative rows.

    Input:
        dictionary: the dictionary created via get_split_binary_data
        positive_class_size: the size of the positive data
        ratio: ratio of negative_class_size to positive_class_size
    Returns:
        (X_test, Y_test)
    """
    negative_class_size = int(positive_class_size * ratio)

    reviews = load_data("data/dataset.csv")
    reviews = reviews[reviews['label'] != 0]
    good = reviews[reviews['label'] == 1].copy()
    bad = reviews[reviews['label'] == -1].copy()

    test_rows = pd.concat(
        [good[:positive_class_size], bad[:negative_class_size]],
        ignore_index=True,
    )
    # Copy the labels out before the frame is handed to the featurizer.
    labels = test_rows['label'].values.copy()
    features = project1.generate_feature_matrix(test_rows, dictionary)

    return (features, labels)