Beispiel #1
0
def visualize(samples_data, config):
    """
    Perform visualization operations.

    The user is asked, one question at a time, which plots to produce:
    the AV features of the selected samples, the data set colored by
    family, and the data set colored by a previously saved clustering
    result.

    :param samples_data: DataFrame with samples information; only the rows
        whose 'selected' column equals 1 are considered
    :param config: dictionary containing application configuration options
    :return: None
    """

    # Build the selection mask once and reuse it for both lookups.
    selected = samples_data['selected'] == 1
    uuids = samples_data.index[selected].tolist()
    families = samples_data.family[selected].tolist()

    if interaction.ask_yes_no(constants.msg_vis_features):
        vis_cluster.plot_av_features(uuids, config)

    if interaction.ask_yes_no(constants.msg_vis_dataset):
        data_matrix = interaction.ask_file(constants.msg_vis_base)
        vis_data.plot_data(data_matrix, families)

    if interaction.ask_yes_no(constants.msg_visualize_clu):
        results_file = interaction.ask_file(constants.msg_results_clu)
        # Read the clustering result inside a context manager so the file
        # handle is closed even if the JSON content is malformed.
        with open(results_file, 'r') as results:
            data = json.load(results)

        # Labels are taken in sorted key order of the loaded mapping.
        y_pred = [data[uuid] for uuid in sorted(data)]

        if interaction.ask_yes_no(constants.msg_visualize_feature_clu):
            vis_cluster.plot_cluster_features(config, data)

        data_matrix = interaction.ask_file(constants.msg_vis_base)
        vis_data.plot_data(data_matrix, y_pred)
Beispiel #2
0
def select_data(config, x_train, x_dev, x_test):
    """
    Let the user choose which representation of the data to work on.

    Two outcomes are possible:
     - sparse: the full vectors of each split are loaded through
       loader_tfidf (dense=False, ordered=True),
     - not sparse: one reduced data matrix per split is read from a text
       file selected by the user.

    :param config: Global configuration dictionary
    :param x_train: List of train set uuids
    :param x_dev: List of dev set uuids
    :param x_test: List of test set uuids
    :return: tuple with the train, dev and test data matrices
    """

    if not interaction.ask_yes_no(constants.msg_sparse):
        # Reduced matrices: ask for and read each file in train/dev/test order.
        train_matrix = np.loadtxt(
            interaction.ask_file(constants.msg_data_train))
        dev_matrix = np.loadtxt(
            interaction.ask_file(constants.msg_data_dev))
        test_matrix = np.loadtxt(
            interaction.ask_file(constants.msg_data_test))
        return train_matrix, dev_matrix, test_matrix

    # Full vectors: the same loader options apply to every split.
    tfidf_options = {'dense': False, 'ordered': True}
    train_matrix = loader_tfidf.load_tfidf(config, x_train, **tfidf_options)
    dev_matrix = loader_tfidf.load_tfidf(config, x_dev, **tfidf_options)
    test_matrix = loader_tfidf.load_tfidf(config, x_test, **tfidf_options)

    return train_matrix, dev_matrix, test_matrix
Beispiel #3
0
def select_data(config, uuids):
    """
    Let the user choose which representation of the data to work on.

    Three outcomes are possible:
     - sparse: the full vectors are loaded through loader_tfidf
       (dense=False, ordered=True),
     - not sparse, mini batches: the list of uuids is returned unchanged,
     - not sparse, no mini batches: a reduced data matrix is read from a
       text file selected by the user.

    :param config: Global configuration dictionary
    :param uuids: List of uuids
    :return: the selected data (loaded vectors, the uuids list or a matrix)
    """

    if interaction.ask_yes_no(constants.msg_sparse):
        return loader_tfidf.load_tfidf(
            config, uuids, dense=False, ordered=True)

    # The mini-batch question is only asked when sparse data was declined.
    if interaction.ask_yes_no(constants.msg_mini):
        return uuids

    reduced_file = interaction.ask_file(constants.msg_data_red)
    return np.loadtxt(reduced_file)
Beispiel #4
0
def keywords_extraction(config):
    """
    Perform keywords extraction from the clustered data.

    The user picks one of the available extraction methods and then the
    file holding the clustering results; the chosen method's
    ``extract_keywords`` is invoked on that file.

    :param config: configuration dictionary
    :return: None
    """

    # Maps each selectable action name to the object implementing it.
    kws = {
        'tfidf': kw_keyword_tfidf
    }

    # Prompts the user to select an action
    kw = interaction.ask_action(constants.msg_kw, set(kws.keys()))
    if kw == 's':
        # 's' is not one of the extraction methods; nothing to do.
        return

    result_file = interaction.ask_file(constants.msg_results_cluster)
    # `kw` is only the selected key: dispatch through the mapping instead of
    # calling extract_keywords on the key itself.
    kws[kw].extract_keywords(config, result_file)
Beispiel #5
0
def reduce(config,
           components,
           uuids=None,
           x_train=None,
           x_dev=None,
           x_test=None):
    """
    Use a trained random forest classifier to select a reduced number of features from the data set.

    Either ``uuids`` is given (a single matrix is produced), or the three
    ``x_train`` / ``x_dev`` / ``x_test`` lists are given (one matrix per
    split is produced). Every produced matrix is also written as text
    into the matrices directory.

    :param config: configuration dictionary
    :param components: number of desired components
    :param uuids: list of selected uuids
    :param x_train: List of train set uuids
    :param x_dev: List of dev set uuids
    :param x_test: List of test set uuids
    :return: tuple (data, rfc) where data is the reduced matrix, or a
        (train, dev, test) tuple of reduced matrices, and rfc is the
        loaded classifier
    """

    words_file = os.path.join(constants.dir_d, constants.json_words)
    # Context manager guarantees the words file is closed after parsing.
    with open(words_file, 'r') as words_in:
        words = json.load(words_in)
    inv_words = {value: key for key, value in words.items()}

    print('Performing feature deletion using Random Forest Classifiers')

    rfc_file = interaction.ask_file(constants.msg_data_rfc)
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; only classifier files from trusted sources should be
    # selected here.
    rfc = joblib.load(rfc_file)

    n_uuid = len(uuids) if uuids else len(x_train)

    selected_feats = get_least_important_feats(rfc.feature_importances_,
                                               components, inv_words, n_uuid)

    matrix_dir = os.path.join(constants.dir_d, constants.dir_mat)

    if uuids:
        data = load_selected_feats(config, uuids, selected_feats)
        matrix_file = os.path.join(
            matrix_dir, 'irfc_{}_{}.txt'.format(components, len(uuids)))
        with open(matrix_file, 'wb') as matrix_out:
            np.savetxt(matrix_out, data)

    else:
        # Same load-then-save step for each split; only the uuids and the
        # file name suffix differ. Order (train, dev, test) is preserved.
        reduced = []
        for suffix, split_uuids in (('tr', x_train),
                                    ('dv', x_dev),
                                    ('te', x_test)):
            matrix = load_selected_feats(config, split_uuids, selected_feats)
            matrix_file = os.path.join(
                matrix_dir,
                'irfc_{}_{}_{}.txt'.format(components, len(matrix), suffix))
            with open(matrix_file, 'wb') as matrix_out:
                np.savetxt(matrix_out, matrix)
            reduced.append(matrix)

        data = tuple(reduced)

    return data, rfc