Example #1
def main():
    parser = argparse.ArgumentParser(
        description=
        'maps given high-dimensional documents to 2d document representations with t-SNE'
    )
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--documents-2d',
                        type=argparse.FileType('w'),
                        help='path to output document-2d-data (.npz)',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_documents_2d_path = args.documents_2d.name

    document_topics = load_document_topics(input_document_topics_path)
    #model = decomposition.PCA(n_components=2)
    model = TSNE(n_components=2, verbose=1, perplexity=100, n_iter=1000)
    logger.info('running 2d-transformation with model {}'.format(model))
    documents_2d = model.fit_transform(document_topics)
    logger.debug('2d-transformation result\n{}'.format(documents_2d))

    logger.info('saving 2d-documents')
    save_npz(output_documents_2d_path, documents_2d)
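All of these examples depend on small project-local helpers (load_document_topics, save_npz, logger, ...) whose implementations are not shown. A minimal sketch of the two I/O helpers, assuming each .npz archive holds a single dense matrix under numpy's default key 'arr_0' (an assumption, not the project's confirmed layout):

import numpy as np

def load_document_topics(path):
    # assumption: the archive stores one dense document-topic matrix
    # under numpy's default key 'arr_0'
    with np.load(path) as data:
        return data['arr_0']

def save_npz(path, matrix):
    # stores the matrix under the default key, matching the loader above
    np.savez(path, matrix)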
Example #2
def main():
    parser = argparse.ArgumentParser(description='plots given 2d-transformed documents represented by their topic distributions (optional: with clusters)')
    parser.add_argument('--documents-2d', type=argparse.FileType('r'), help='path to input document-2d-data (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input cluster labels .json.bz2 file')
    parser.add_argument('--img-file', type=argparse.FileType('w'), help='path to output image file', required=True)

    args = parser.parse_args()
    input_documents_2d_path = args.documents_2d.name
    input_cluster_labels_path = args.cluster_labels.name if args.cluster_labels else None
    output_img_path = args.img_file.name

    logger.info('loading 2d-transformed document topics')
    documents_2d = load_document_topics(input_documents_2d_path)     
    
    if input_cluster_labels_path:
        logger.info('loading cluster labels')
        cluster_labels = load_communities(input_cluster_labels_path)
        cluster_labels = np.array(cluster_labels)
    else:
        logger.info('no cluster labels given')
        cluster_labels = None
        
    logger.info('plotting 2d-documents')
    size = 1
    scatter_2d_plot(documents_2d[:,0], documents_2d[:,1], output_img_path, labels=cluster_labels, rasterized=True, size=size)
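scatter_2d_plot is another project-local helper. A minimal matplotlib sketch, assuming labels are integer cluster ids used for point colors and that size/rasterized are passed straight through to scatter (a hypothetical signature inferred from the call above):

import matplotlib.pyplot as plt

def scatter_2d_plot(x, y, output_path, labels=None, rasterized=False, size=1):
    # colors points by cluster label when labels are given;
    # rasterizing the points keeps vector output files small
    fig, ax = plt.subplots()
    ax.scatter(x, y, c=labels, s=size, rasterized=rasterized)
    fig.savefig(output_path, bbox_inches='tight')
    plt.close(fig)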
Example #3
def main():
    parser = argparse.ArgumentParser(
        description=
        'plots 1. average probabilities per topic 2. cdf of these probabilities')
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--topic-avg-probs',
                        type=argparse.FileType('w'),
                        help='path to output avg prob plot file',
                        required=True)
    parser.add_argument('--topic-avg-probs-cdf',
                        type=argparse.FileType('w'),
                        help='path to output avg prob cdf plot file',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_topic_avg_probs_path = args.topic_avg_probs.name
    output_topic_avg_probs_cdf_path = args.topic_avg_probs_cdf.name

    document_topics = load_document_topics(input_document_topics_path)

    logger.info('calculating average probability per topic')
    average_topic_props = np.average(document_topics, axis=0)
    logger.info('shape of averaged result {}'.format(average_topic_props.shape))
    average_topic_props[::-1].sort()
    logger.info('sum over averages {}'.format(average_topic_props.sum()))

    logger.info('plotting average topic probabilities')
    xlabel = 'Topic'
    ylabel = 'Avg. proportion'
    scatter_plot(average_topic_props, output_topic_avg_probs_path, xlabel,
                 ylabel)

    average_topic_props_cdf = np.cumsum(average_topic_props)
    logger.info(
        'plotting average topic probabilities cumulative distribution function')
    xlabel = 'Topic'
    ylabel = 'Avg. proportion (CDF)'
    scatter_plot(average_topic_props_cdf, output_topic_avg_probs_cdf_path,
                 xlabel, ylabel)
Example #4
def main():
    parser = argparse.ArgumentParser(description='creates a file of clusterings: clusters are sorted descending by size, cluster elements are sorted by distance to cluster centroid')    
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)  
    parser.add_argument('--centrality-data', type=argparse.FileType('w'), help='path to output .json cluster->centrality_data file', required=True)
    parser.add_argument('--max-docs-per-clus', type=int, help='maximum number of most central documents considered per cluster', required=True)
    parser.add_argument('--metric', help='metric used to compute dissimilarity to the centroids (must be supported by scipy cdist)', required=True)
    
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    max_docs_per_clus = args.max_docs_per_clus
    metric = args.metric
    
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'input_cluster_labels_path':input_cluster_labels_path, 'input_titles_path':input_titles_path, 'output_centrality_data_path':output_centrality_data_path, 'max_docs_per_clus':max_docs_per_clus, 'metric':metric})))
        
    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)
    document_titles = load_titles(input_titles_path)
        
    clusters = get_clusters_from_labels(cluster_labels)    
    logger.info('computing {}-centralities of {} documents in {} communities'.format(metric, len(cluster_labels), len(clusters)))
    centrality_data = {}
    for clus_id, cluster in enumerate(clusters):
        max_doc_ids, centralities = get_top_central_cluster_docs(cluster, document_topics, max_docs_per_clus, metric)
        logger.debug('max doc ids {}'.format(max_doc_ids))
        logger.debug('max doc centralities {}'.format(centralities))
        max_doc_titles = get_document_titles(max_doc_ids, document_titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_cluster = {
            'size': len(cluster),
            'titles': max_doc_titles, 
            'centralities': centralities
        }
        centrality_data[clus_id] = centrality_data_of_cluster
    
    logger.info('saving cluster centrality data (titles,centralities) of {} clusters'.format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
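get_top_central_cluster_docs is not shown; since the --metric help refers to scipy's cdist, a plausible sketch (an assumption, not the project's confirmed implementation) ranks a cluster's documents by their distance to the cluster centroid:

import numpy as np
from scipy.spatial.distance import cdist

def get_top_central_cluster_docs(cluster, document_topics, max_docs, metric):
    # assumption: cluster is a sequence of document ids
    cluster = np.asarray(cluster)
    docs = document_topics[cluster]
    centroid = docs.mean(axis=0, keepdims=True)
    distances = cdist(docs, centroid, metric=metric).ravel()
    order = np.argsort(distances)[:max_docs]  # most central documents first
    return cluster[order].tolist(), distances[order].tolist()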
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        'plots the descending purities of each cluster (purity: highest cosine similarity to a [0,...,0,1,0,...,0] topic vector)'
    )
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--cluster-labels',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 clustering file',
                        required=True)
    parser.add_argument('--plot',
                        type=argparse.FileType('w'),
                        help='path to output purity plot file',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    output_plot_path = args.plot.name

    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)

    clusters = get_clusters_from_labels(cluster_labels)
    logger.info('calculating purity of {} clusters'.format(len(clusters)))
    cluster_purities = [
        get_cluster_purity(cluster, document_topics) for cluster in clusters
    ]
    logger.info('calculated {} purity values'.format(len(cluster_purities)))

    cluster_purities = np.array(cluster_purities)
    cluster_purities[::-1].sort()

    xlabel = 'Cluster'
    ylabel = 'Purity'
    logger.info('plotting purities to {}'.format(output_plot_path))
    scatter_plot(cluster_purities, output_plot_path, xlabel, ylabel)
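get_cluster_purity follows from the description in the parser's help text: the highest cosine similarity to a one-hot topic vector. A sketch assuming purity is evaluated on the cluster centroid and that topic weights are nonnegative, so the cosine similarity to a one-hot vector e_i reduces to centroid[i] / ||centroid||:

import numpy as np

def get_cluster_purity(cluster, document_topics):
    # cosine similarity of the centroid to a one-hot vector e_i is
    # centroid[i] / ||centroid||, so the maximum over all topics is
    # simply max(centroid) / ||centroid||
    centroid = document_topics[np.asarray(cluster)].mean(axis=0)
    return centroid.max() / np.linalg.norm(centroid)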
Example #6
def main():
    parser = argparse.ArgumentParser(
        description=
        'removes outlier documents according to a given outlier scores file')
    parser.add_argument('--documents',
                        type=argparse.FileType('r'),
                        help='path to input documents file (.npz)',
                        required=True)
    parser.add_argument('--outlier-scores',
                        type=argparse.FileType('r'),
                        help='path to input JSON outlier scores file',
                        required=True)
    parser.add_argument('--filtered-documents',
                        type=argparse.FileType('w'),
                        help='path to output filtered documents file (.npz)',
                        required=True)
    parser.add_argument('--contamination',
                        type=float,
                        help='relative amount of most noisy samples to remove',
                        required=True)

    args = parser.parse_args()
    input_document_path = args.documents.name
    input_outlier_scores_path = args.outlier_scores.name
    output_filtered_documents_path = args.filtered_documents.name
    contamination = args.contamination

    documents = load_document_topics(input_document_path)
    outlier_scores = load_cluster_labels(input_outlier_scores_path)
    logger.info('filtering documents of shape {} with contamination {}'.format(
        documents.shape, contamination))
    filtered_documents = get_filtered_documents(documents, outlier_scores,
                                                contamination)
    logger.info('shape of filtered documents {}'.format(
        filtered_documents.shape))

    logger.info('saving filtered documents to {}'.format(
        output_filtered_documents_path))
    save_npz(output_filtered_documents_path, filtered_documents)
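get_filtered_documents is assumed here to drop the given fraction of documents with the highest outlier scores (higher score = more anomalous); a minimal sketch under that assumption:

import numpy as np

def get_filtered_documents(documents, outlier_scores, contamination):
    # removes the ceil(contamination * n) documents with the largest scores,
    # keeping the remaining rows in their original order
    scores = np.asarray(outlier_scores)
    num_outliers = int(np.ceil(contamination * len(scores)))
    keep = np.argsort(scores)[:len(scores) - num_outliers]
    return documents[np.sort(keep)]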
Example #7
def main():
    parser = argparse.ArgumentParser(description='calculates local outlier factors of given documents')
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--outlier-scores', type=argparse.FileType('w'), help='path to output JSON outlier scores file (.json)', required=True)
    parser.add_argument('--k-min', type=int, help='minimum number of considered neighbors per sample', required=True)
    parser.add_argument('--k-max', type=int, help='maximum number of considered neighbors per sample', required=True)
    parser.add_argument('--metric', help='distance metric of outlier detection', required=True)
    
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_outlier_scores_path = args.outlier_scores.name
    k_min, k_max = args.k_min, args.k_max
    metric = args.metric
    
    document_topics = load_document_topics(input_document_topics_path)    
    outlier_scores = calc_max_lof_of_bounds(document_topics, metric, k_min, k_max)
    logger.info('calculated {} LOF-scores'.format(len(outlier_scores)))
    logger.debug('scores \n{}'.format(outlier_scores))
    
    logger.info('writing scores to {}'.format(output_outlier_scores_path))
    with open(output_outlier_scores_path, 'w') as output_outlier_scores_file:
        json.dump(outlier_scores.tolist(), output_outlier_scores_file, indent=1)
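calc_max_lof_of_bounds presumably takes, per sample, the maximum local outlier factor over all neighborhood sizes k in [k_min, k_max]. A sketch with scikit-learn's LocalOutlierFactor, whose fitted estimator exposes scores as negative_outlier_factor_ (so LOF = -negative_outlier_factor_); returning a numpy array matches the .tolist() call above:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def calc_max_lof_of_bounds(data, metric, k_min, k_max):
    # elementwise maximum of LOF scores over the k range
    max_lof = np.full(len(data), -np.inf)
    for k in range(k_min, k_max + 1):
        lof = LocalOutlierFactor(n_neighbors=k, metric=metric).fit(data)
        max_lof = np.maximum(max_lof, -lof.negative_outlier_factor_)
    return max_lof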
Example #8
def main():
    parser = argparse.ArgumentParser(
        description=
        'calculates silhouette coefficient of a given clustering and its document-topic-matrix'
    )
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--cluster-labels',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 cluster labels file',
                        required=True)
    parser.add_argument('--metric',
                        choices=_VALID_METRICS,
                        help='distance function to use',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    metric = args.metric

    logger.info('loading document topics')
    document_topics = load_document_topics(input_document_topics_path)
    logger.info('loading cluster labels')
    cluster_labels = load_communities(input_cluster_labels_path)
    logger.debug(cluster_labels)

    logger.info('calculating unsupervised evaluation metrics')
    sil_score = silhouette_score(document_topics,
                                 cluster_labels,
                                 metric=metric)  # higher is better
    logger.info('{} silhouette coefficient: {}'.format(metric, sil_score))
    ch_score = calinski_harabaz_score(
        document_topics, cluster_labels
    )  # ratio of between-cluster to within-cluster scatter, incl. penalty terms -> higher is better
    logger.info('calinski harabaz score: {}'.format(ch_score))
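All examples also assume module-level setup that is never shown: a configured logger and, in this last script, _VALID_METRICS. A plausible (unconfirmed) version; note that _VALID_METRICS is a private scikit-learn name and that calinski_harabaz_score is the spelling used by older scikit-learn releases:

import logging

from sklearn.metrics import silhouette_score
# old spelling, removed in newer scikit-learn releases where the
# function is called calinski_harabasz_score
from sklearn.metrics import calinski_harabaz_score
# private scikit-learn name; its exact location is an assumption
from sklearn.metrics.pairwise import _VALID_METRICS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)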