Example #1
def select_data(config, x_train, x_dev, x_test):
    """
    Asks the user for the data to operate upon.
     - work on the full vectors (sparse = true),
     - work on a reduced data matrix (sparse = false)

    :param config: Global configuration dictionary
    :param x_train: List of train set uuids
    :param x_dev: List of dev set uuids
    :param x_test: List of test set uuids
    :return: data matrices
    """

    sparse = interaction.ask_yes_no(constants.msg_sparse)

    if sparse:
        xm_train = loader_tfidf.load_tfidf(config,
                                           x_train,
                                           dense=False,
                                           ordered=True)
        xm_dev = loader_tfidf.load_tfidf(config,
                                         x_dev,
                                         dense=False,
                                         ordered=True)
        xm_test = loader_tfidf.load_tfidf(config,
                                          x_test,
                                          dense=False,
                                          ordered=True)

    else:
        xm_train = np.loadtxt(interaction.ask_file(constants.msg_data_train))
        xm_dev = np.loadtxt(interaction.ask_file(constants.msg_data_dev))
        xm_test = np.loadtxt(interaction.ask_file(constants.msg_data_test))

    return xm_train, xm_dev, xm_test
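In the dense branch, the reduced matrices are plain whitespace-delimited text files read back with np.loadtxt. A minimal, self-contained sketch of that round trip, using a placeholder path and a random matrix instead of a real reduced data set:

import numpy as np

# Write a small reduced matrix the way an earlier reduction step might have,
# then read it back exactly as the dense branch above does.
reduced = np.random.rand(10, 3)
with open('/tmp/reduced_train.txt', 'wb') as f:
    np.savetxt(f, reduced)

xm_train = np.loadtxt('/tmp/reduced_train.txt')
print(xm_train.shape)  # (10, 3)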
Example #2
def train(config, birch, rows, rand_uuids, mini_batch_size):
    """
    Train the Birch clustering algorithm with fixed-size batches of data taken at random from the input data set.

    :param config: global configuration dictionary
    :param mini_batch_size: size of each mini batch
    :param birch: Birch object
    :param rows: number of rows of the data set matrix
    :param rand_uuids: list of documents in randomized order
    :return:
    """

    clustered = 0

    # Divide the documents into mini batches of fixed size and train Birch incrementally
    while clustered < rows:
        print('Processing documents from {} to {}'.format(
            clustered, (clustered + mini_batch_size - 1)))

        data = loader_tfidf.load_tfidf(
            config,
            rand_uuids[clustered:][:mini_batch_size],
            dense=True,
            ordered=False)

        clustered += mini_batch_size

        birch.partial_fit(data)
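A minimal sketch of the same mini-batch training loop, with a random NumPy matrix standing in for the dense TF-IDF batches returned by loader_tfidf.load_tfidf:

import numpy as np
from sklearn.cluster import Birch

rng = np.random.RandomState(0)
X = rng.rand(1000, 20)          # stand-in for the dense TF-IDF matrix
mini_batch_size = 256
birch = Birch(n_clusters=8)

clustered = 0
while clustered < X.shape[0]:
    # train incrementally, one fixed-size batch at a time
    birch.partial_fit(X[clustered:clustered + mini_batch_size])
    clustered += mini_batch_size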
Example #3
def apply(config, birch, rows, uuids, mini_batch_size):
    """
    Apply the Birch clustering algorithm to fixed-size batches of data taken in order from the input data set.

    :param config: global configuration dictionary
    :param mini_batch_size: size of each mini batch
    :param birch: Birch object
    :param rows: number of rows of the data set matrix
    :param uuids: list of documents in order
    :return: labels computed by Birch over the data set
    """

    clustered = 0
    computed_labels = np.array([])

    # Divide the documents into mini batches of fixed size and apply Birch to each
    while clustered < rows:
        print('Predicting documents from {} to {}'.format(
            clustered, (clustered + mini_batch_size - 1)))

        data = loader_tfidf.load_tfidf(config,
                                       uuids[clustered:][:mini_batch_size],
                                       dense=True,
                                       ordered=True)

        clustered += mini_batch_size

        batch_computed_labels = birch.predict(data)

        computed_labels = np.append(computed_labels, batch_computed_labels)

    return computed_labels
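Continuing the Birch sketch from Example #2, prediction follows the same batching pattern, accumulating the labels of each batch into a single array:

computed_labels = np.array([])
clustered = 0
while clustered < X.shape[0]:
    batch_labels = birch.predict(X[clustered:clustered + mini_batch_size])
    computed_labels = np.append(computed_labels, batch_labels)
    clustered += mini_batch_size

print(computed_labels.shape)    # (1000,)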
Example #4
def select_data(config, uuids):
    """
    Asks the user for the data to operate upon.
     - work on the full vectors (sparse = true),
     - work on the full vectors with mini batches (sparse = false, mini = true)
     - work on a reduced data matrix (sparse = false, mini = false)

    :param config: Global configuration dictionary
    :param uuids: List of uuids
    :return: data matrices
    """

    sparse = interaction.ask_yes_no(constants.msg_sparse)

    if sparse:
        data = loader_tfidf.load_tfidf(config,
                                       uuids,
                                       dense=False,
                                       ordered=True)
    else:
        mini = interaction.ask_yes_no(constants.msg_mini)
        if mini:
            data = uuids
        else:
            data = np.loadtxt(interaction.ask_file(constants.msg_data_red))

    return data
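When the mini-batch path is chosen, the function returns the uuid list untouched and defers loading to the caller. A small sketch of how a caller might then stream the data in fixed-size chunks (the uuid list and batch size are placeholders; the real code would call loader_tfidf.load_tfidf on each chunk, as in Example #7):

uuids = ['uuid-{:04d}'.format(i) for i in range(1000)]
batch_size = 256

for start in range(0, len(uuids), batch_size):
    batch_uuids = uuids[start:start + batch_size]
    print('would load {} documents'.format(len(batch_uuids)))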
Example #5
def train_pca(config, i_pca, rows, rand_uuids, mini_batch_size):
    """
    Train the PCA algorithm incrementally using mini batches of data.

    :param config: global configuration dictionary
    :param i_pca: IncrementalPCA object
    :param rows: number of rows of the data set matrix
    :param rand_uuids: list of documents in random order
    :param mini_batch_size: size of each mini batch
    :return:
    """

    decomposed = 0

    while decomposed < rows:
        print('Processing documents from {} to {}'.format(
            decomposed, (decomposed + mini_batch_size - 1)))
        data = loader_tfidf.load_tfidf(
            config,
            rand_uuids[decomposed:][:mini_batch_size],
            dense=True,
            ordered=False)

        decomposed += mini_batch_size

        i_pca.partial_fit(data)
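A minimal sketch of the same incremental fit with scikit-learn's IncrementalPCA, again with a random matrix in place of the loaded TF-IDF data; note that each mini batch has to contain at least n_components rows:

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X = rng.rand(1000, 50)           # stand-in for the dense TF-IDF matrix
mini_batch_size = 200
i_pca = IncrementalPCA(n_components=10)

decomposed = 0
while decomposed < X.shape[0]:
    # each partial_fit call updates the principal components incrementally
    i_pca.partial_fit(X[decomposed:decomposed + mini_batch_size])
    decomposed += mini_batch_size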
Example #6
def transform_vectors(config, i_pca, rows, uuids, mini_batch_size):
    """
    Transform the data vectors.

    :param config: global configuration dictionary
    :param i_pca: IncrementalPCA object
    :param rows: number of rows of the data set matrix
    :param uuids: list of documents in order
    :param mini_batch_size: size of each mini batch
    :return: reduced data set matrix
    """

    decomposed = 0
    new_data = []

    while decomposed < rows:
        print('Transforming documents from {} to {}'.format(
            decomposed, (decomposed + mini_batch_size - 1)))
        data = loader_tfidf.load_tfidf(config,
                                       uuids[decomposed:][:mini_batch_size],
                                       dense=True,
                                       ordered=True)

        decomposed += mini_batch_size

        new_data.append(i_pca.transform(data))

    return np.concatenate(new_data)
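Continuing the IncrementalPCA sketch from Example #5, the fitted model can transform the data with the same batching, and the pieces are concatenated back into one reduced matrix:

new_data = []
decomposed = 0
while decomposed < X.shape[0]:
    new_data.append(i_pca.transform(X[decomposed:decomposed + mini_batch_size]))
    decomposed += mini_batch_size

reduced = np.concatenate(new_data)
print(reduced.shape)             # (1000, 10)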
Example #7
def load_selected_feats(config, uuids, selected):
    """
    Select the specified features from the data vectors by loading the data set in mini batches.

    :param config: application configuration dictionary
    :param uuids: list of uuids to load
    :param selected: list of indices of selected features
    :return: matrix of the loaded data restricted to the selected feature columns
    """

    load_batch_size = config['batch_size']
    new_data = []
    t = 0

    while t < len(uuids):
        batch = loader_tfidf.load_tfidf(config, uuids[t: t + load_batch_size], dense=True, ordered=True)
        batch = np.take(batch, selected, axis=1)
        new_data.append(batch)
        t += load_batch_size

    return np.concatenate(new_data)
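The column selection itself is plain NumPy: np.take with axis=1 keeps only the chosen feature columns. A self-contained sketch with a random matrix in place of one loaded TF-IDF batch:

import numpy as np

batch = np.random.rand(100, 500)        # stand-in for one loaded batch
selected = [3, 17, 42, 256]             # indices of the selected features

reduced_batch = np.take(batch, selected, axis=1)
print(reduced_batch.shape)              # (100, 4)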
Example #8
def reduce(config,
           components,
           uuids=None,
           x_train=None,
           x_dev=None,
           x_test=None):
    """
    Lower dimensionality of data vectors using tSNE.

    :param config: configuration dictionary
    :param components: number of desired components
    :param uuids: list of selected uuids
    :param x_train: List of train set uuids
    :param x_dev: List of dev set uuids
    :param x_test: List of test set uuids
    :return: reduced data (a single matrix or a train/dev/test tuple) and the fitted tSNE model
    """

    print('Performing feature extraction using TSNE')

    tsne = TSNE(n_components=components,
                method='exact',
                early_exaggeration=6.0,
                n_iter=1000,
                n_iter_without_progress=100,
                learning_rate=1000)

    if uuids:
        train = loader_tfidf.load_tfidf(config,
                                        uuids,
                                        dense=False,
                                        ordered=True)
        data = tsne.fit_transform(train)
        rows = len(uuids)
        matrix_file = os.path.join(constants.dir_d, constants.dir_mat,
                                   'tsne_{}_{}.txt'.format(components, rows))
        np.savetxt(open(matrix_file, 'wb'), data)

    else:
        t_train = loader_tfidf.load_tfidf(config,
                                          x_train,
                                          dense=False,
                                          ordered=True)
        t_train = tsne.fit_transform(t_train)
        matrix_file = os.path.join(
            constants.dir_d, constants.dir_mat,
            'tsne_{}_{}_tr.txt'.format(components, len(t_train)))
        np.savetxt(open(matrix_file, 'wb'), t_train)
        rows = len(t_train)

        t_dev = loader_tfidf.load_tfidf(config,
                                        x_dev,
                                        dense=False,
                                        ordered=True)
        t_dev = tsne.fit_transform(t_dev)
        matrix_file = os.path.join(
            constants.dir_d, constants.dir_mat,
            'tsne_{}_{}_dv.txt'.format(components, len(t_dev)))
        np.savetxt(open(matrix_file, 'wb'), t_dev)

        t_test = loader_tfidf.load_tfidf(config,
                                         x_test,
                                         dense=False,
                                         ordered=True)
        t_test = tsne.fit_transform(t_test)
        matrix_file = os.path.join(
            constants.dir_d, constants.dir_mat,
            'tsne_{}_{}_te.txt'.format(components, len(t_test)))
        np.savetxt(open(matrix_file, 'wb'), t_test)

        data = (t_train, t_dev, t_test)

    model_file = os.path.join(constants.dir_d, constants.dir_mod,
                              'tsne_{}_{}.pkl'.format(components, rows))
    joblib.dump(tsne, model_file)

    return data, tsne
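A minimal sketch of the exact-method t-SNE reduction and the save steps on a small random matrix; the file paths are placeholders rather than the project's constants. Since scikit-learn's TSNE has no separate transform method, the train, dev and test splits in the code above are each embedded with their own fit_transform call.

import numpy as np
from sklearn.manifold import TSNE
import joblib

rng = np.random.RandomState(0)
train = rng.rand(200, 50)               # stand-in for the TF-IDF matrix

tsne = TSNE(n_components=2, method='exact')
data = tsne.fit_transform(train)        # reduced matrix of shape (200, 2)

with open('/tmp/tsne_2_200.txt', 'wb') as f:
    np.savetxt(f, data)
joblib.dump(tsne, '/tmp/tsne_2_200.pkl')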