Example #1
def analyze_metacluster_from_config(config):
    '''
    Analyzes the metaclusters and writes descriptive information and
    summary statistics to the postprocessing output directory.

    Args:
        config: dict, the parsed global configuration
    '''

    score_method = config["metacluster"]["score_method"]
    config = config["postprocessing"]
    topn_words_returned = config["topn_words_returned"]

    save_dest = config['output_data_directory']
    os.system('mkdir -p {}'.format(save_dest))

    model = uds.load_w2vec()
    ORG = uds.load_ORG_data(config["master_columns"])

    MC = uds.load_metacluster_data()
    C = MC["meta_centroids"]

    DV = uds.load_document_vectors(score_method)

    # Fix any zero vectors with random ones
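    # (a zero-norm vector has no direction, so distance and dispersion
    # measures based on cosine similarity are ill-defined for it)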
    dim = DV["docv"].shape[1]
    idx = np.where(np.linalg.norm(DV["docv"], axis=1) == 0)[0]
    for i in idx:
        vec = np.random.uniform(size=(dim, ))
        vec /= np.linalg.norm(vec)
        DV["docv"][i] = vec

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])

    if config["compute_dispersion"]:
        logger.info("Computing intra-document dispersion.")
        dist = _compute_dispersion_matrix(DV["docv"], MC["meta_labels"])

        # Compute the linkage and the order
        linkage = hierarchy.linkage(dist, method='average')
        d_idx = hierarchy.dendrogram(linkage, no_plot=True)["leaves"]
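        # "leaves" gives the left-to-right leaf order of the dendrogram,
        # which is stored below as the "dispersion_order" column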

    else:
        # If dispersion is not calculated, set d_idx to the sorted cluster labels
        d_idx = np.sort(labels)

    #

    V = DV["docv"]
    data = []
    for cx, cluster_id in zip(C, labels):
        idx = MC["meta_labels"] == cluster_id

        item = {}
        item["counts"] = idx.sum()
        item["avg_centroid_distance"] = _compute_centroid_dist(V[idx], cx)

        if config["compute_dispersion"]:
            item["intra_document_dispersion"] = dist[cluster_id, cluster_id]
        else:
            item["intra_document_dispersion"] = -1

        # Compute closest words to the centroid
        desc = ' '.join(
            w for w, _ in model.wv.similar_by_vector(
                cx, topn=topn_words_returned))
        item["word2vec_description"] = desc

        data.append(item)

    df = pd.DataFrame(data, index=labels)

    df.index.name = "cluster_id"
    df["dispersion_order"] = d_idx

    cols = [
        "dispersion_order", "counts", "avg_centroid_distance",
        "intra_document_dispersion", "word2vec_description"
    ]

    df = df[cols]

    f_csv = os.path.join(save_dest, "cluster_desc.csv")
    df.to_csv(f_csv, index_label="cluster_id")

    logger.info("Computing master-label spreadsheets.")
    cluster_lookup = dict(zip(df.index, df.dispersion_order.values))
    ORG["cluster_id"] = MC["meta_labels"]
    ORG["dispersion_order"] = -1

    for i, j in cluster_lookup.items():
        idx = ORG["cluster_id"] == i
        ORG.loc[idx, "dispersion_order"] = j

    special_cols = ["_ref", "cluster_id", "dispersion_order"]
    cols = [x for x in ORG.columns if x not in special_cols]

    ORG = ORG[special_cols + cols]

    f_csv = os.path.join(save_dest, "cluster_master_labels.csv")
    ORG.to_csv(f_csv, index=False)

    print(df)  # Output the result to stdout
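
For orientation, the configuration keys this example reads imply a layout roughly like the sketch below. The key names come from the code above; the nesting and the values are illustrative assumptions, not the project's documented defaults.

# Hypothetical config sketch -- key names taken from Example #1,
# values are placeholders only.
config = {
    "metacluster": {
        "score_method": "simple_score",              # placeholder method name
    },
    "postprocessing": {
        "output_data_directory": "results/postprocessing",
        "topn_words_returned": 10,                   # words per cluster description
        "master_columns": ["title", "abstract"],     # placeholder column names
        "compute_dispersion": True,                  # toggle the dispersion matrix
    },
}

analyze_metacluster_from_config(config)
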
Example #2
def predict_from_config(config):

    ERROR_MATRIX = {}
    PREDICTIONS = {}

    use_meta = config["predict"]['use_meta']

    # For now, we can only deal with one column using meta!
    assert(len(config["predict"]["categorical_columns"]) == 1)

    methods = uds.get_score_methods()

    pred_col = config["target_column"]

    pred_output_dir = config["predict"]["output_data_directory"]
    extra_cols = config["predict"]["extra_columns"]
    mkdir(pred_output_dir)

    # Load the categorical columns
    df = uds.load_ORG_data(config["predict"]["categorical_columns"])
    ITR = itertools.product(methods, config["predict"]["categorical_columns"])

    X_META = []

    cfg = config["predict"]
    cfg["_PARALLEL"] = config["_PARALLEL"]
    df_scores = None

    for (method, cat_col) in ITR:

        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        DV = uds.load_document_vectors(method)
        X = DV["docv"]

        if use_meta:
            X_META.append(X)

        Y = np.hstack(df[cat_col].values)
        counts = np.array(list(collections.Counter(Y).values()), dtype=float)
        counts /= counts.sum()

        msg = " Class balance for categorical prediction: {}"
        logger.info(msg.format(counts))

        # Determine the baseline prediction
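        # (the accuracy obtained by always guessing the most frequent class)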
        y_counts = collections.Counter(Y).values()
        baseline_score = max(y_counts) / float(sum(y_counts))

        # Predict
        scores, F1, errors, pred, dfs = categorical_predict(
            X=X,
            y_org=Y,
            method_name=method,
            use_SMOTE=int(cfg['use_SMOTE']),
            use_PARALLEL=int(cfg['_PARALLEL']),
            n_estimators=int(cfg['n_estimators']),
        )

        text = "  F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(F1.mean(), scores.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors

        if df_scores is None:
            df_scores = dfs
        else:
            df_scores[method] = dfs[method]

    if use_meta:
        # Build meta predictor
        # META_X = np.hstack([PREDICTIONS[method] for method
        #                    in config["predict"]["meta_methods"]])
        X_META = np.hstack(X_META)
        method = "meta"

        text = "Predicting [{}] [{}:{}]"
        logger.info(text.format(method, cat_col, pred_col))

        scores, F1, errors, pred, dfs = categorical_predict(
            X=X_META,
            y_org=Y,
            method_name=method,
            n_estimators=int(cfg['n_estimators']),
            use_PARALLEL=int(cfg['_PARALLEL']),
        )

        text = "  F1 {:0.3f}; Accuracy {:0.3f}; baseline ({:0.3f})"
        logger.info(text.format(F1.mean(), scores.mean(), baseline_score))

        PREDICTIONS[method] = pred
        ERROR_MATRIX[method] = errors
        df_scores[method] = dfs[method]

    # Save the predictions
    if extra_cols:
        df_ORG = uds.load_ORG_data(extra_columns=extra_cols)
        for col in extra_cols:
            df_scores[col] = df_ORG[col]

    f_save = os.path.join(pred_output_dir,
                          "{}_prediction.csv".format(cat_col))
    df_scores.index.name = '_ref'
    df_scores.to_csv(f_save)

    names = list(methods)

    if use_meta:
        names += ["meta", ]

    # Plotting methods here

    df = pd.DataFrame(0, index=names, columns=names)

    max_offdiagonal = 0
    for na, nb in itertools.product(names, repeat=2):
        if na != nb:
            idx = (ERROR_MATRIX[na] == 0) * (ERROR_MATRIX[nb] == 1)
            max_offdiagonal = max(max_offdiagonal, idx.sum())
        else:
            idx = ERROR_MATRIX[na] == 0

        df.loc[nb, na] = idx.sum()

    print(df) # Output result to stdout

    sns.heatmap(df, annot=True, vmin=0, vmax=1.2 * max_offdiagonal, fmt="d")
    plt.yticks(rotation=0)
    plt.xticks(rotation=45)

    plt.show()
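
Example #2 likewise reads its options from a "predict" block plus two top-level keys. The sketch below shows the shape implied by those lookups; the key names come from the code, while every value is a placeholder.

# Hypothetical config sketch -- key names taken from Example #2,
# values are placeholders only.
config = {
    "target_column": "category",                     # only used in the log text here
    "_PARALLEL": True,                               # forwarded to categorical_predict
    "predict": {
        "categorical_columns": ["label_column"],     # exactly one column (see the assert)
        "use_meta": True,                            # also fit the combined "meta" predictor
        "use_SMOTE": 0,
        "n_estimators": 200,
        "extra_columns": [],                         # extra ORG columns copied to the output CSV
        "output_data_directory": "results/predict",
    },
}

predict_from_config(config)
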
Example #3
def _compute_dispersion_matrix(X, labels):
    # Average pairwise cosine distance between (and within) the metaclusters
    n = len(np.unique(labels))
    dist = np.zeros((n, n))

    for i, j in itertools.combinations_with_replacement(range(n), 2):
        if i == j:
            # Distances among documents of the same cluster
            d = pdist(X[labels == i], metric='cosine')
        else:
            # Distances between documents of two different clusters
            d = cdist(X[labels == i], X[labels == j], metric='cosine')
            # Only take the upper triangle (including the diagonal)
            d = d[np.triu_indices(n=d.shape[0], m=d.shape[1], k=0)]

        dist[i, j] = dist[j, i] = d.mean()

    return dist


if __name__ == "__main__" and __package__ is None:

    import simple_config
    config = simple_config.load()["postprocessing"]

    save_dest = config['output_data_directory']
    os.system('mkdir -p {}'.format(save_dest))

    ORG = load_ORG_data(config["master_columns"])

    MC = load_metacluster_data()
    C = MC["meta_centroids"]
    counts = collections.Counter(MC["meta_labels"])

    DV = load_document_vectors()

    # Build the results for the metaclusters
    labels = np.unique(MC["meta_labels"])

    print("Computing intra-document dispersion.")
    dist = _compute_dispersion_matrix(DV["docv"], MC["meta_labels"])

    # Compute the linkage and the order
    linkage = hierarchy.linkage(dist, method='average')