def test_clustering(d,
                    clusters,
                    vlmcs,
                    out_directory,
                    cluster_class=MSTClustering,
                    do_draw_graph=True):
    """Cluster ``vlmcs`` and print (optionally also plot) the result.

    A clustering object is built as ``cluster_class(vlmcs, d, metadata)``,
    where ``metadata`` is looked up from the names of the given vlmcs.
    Its ``cluster`` method is then called with the requested cluster count
    and the returned metrics are handed to the reporting helpers.

    Args:
        d: Forwarded unchanged as the second argument of ``cluster_class``
            (presumably the distance function -- confirm with the caller).
        clusters: Cluster count passed to ``clustering.cluster``.
        vlmcs: Iterable of objects exposing a ``name`` attribute.
        out_directory: Forwarded to the plotting helpers.
        cluster_class: Callable that builds the clustering object.
        do_draw_graph: When true, the plotting helpers are invoked as well.
    """
    names = [vlmc.name for vlmc in vlmcs]
    metadata = get_metadata_for(names)
    clustering = cluster_class(vlmcs, d, metadata)

    # (title, metadata key) pairs, one graph is drawn per pair.
    graph_specs = (
        ('Family', 'family'),
        ('Genus', 'genus'),
        ('Host', 'hosts'),
        ('Baltimore', 'baltimore'),
    )

    # With these bounds the range yields a single value: ``clusters``.
    for cluster_count in range(clusters + 0, clusters - 1, -1):
        print(cluster_count)
        result = clustering.cluster(cluster_count)

        if do_draw_graph:
            plot_largest_components(result, cluster_count, out_directory)
            for title, metadata_key in graph_specs:
                draw_graph(result, title, metadata_key, cluster_count,
                           out_directory)

        print_connected_components(result)
        print_cluster_metrics(result, cluster_count)
def test_clustering(d, vlmcs, cluster_class):
    """Collect clustering metrics for every cluster count below ``len(vlmcs)``.

    A clustering object is built as ``cluster_class(vlmcs, d, metadata)`` and
    its ``cluster`` method is called for ``len(vlmcs) - 1`` down to ``1``.
    Row ``i`` of the result holds the metrics obtained from ``cluster(i)``;
    row ``0`` is never written and stays zero.

    Columns:
        0: ``average_silhouette()``
        1: ``average_percent_same_taxonomy('organism')``
        2: ``average_percent_same_taxonomy('family')``
        3: ``average_percent_same_taxonomy('genus')``
        4, 5: the two values of ``sensitivity_specificity('family')``
        6: ``get_latest_merge_distance()``

    Returns:
        A ``float32`` array of shape ``(len(vlmcs), 7)``.
    """
    metadata = get_metadata_for([vlmc.name for vlmc in vlmcs])

    model_count = len(vlmcs)
    metrics = np.zeros([model_count, 7], dtype=np.float32)

    clustering = cluster_class(vlmcs, d, metadata)
    for cluster_count in reversed(range(1, model_count)):
        print(cluster_count)
        result = clustering.cluster(cluster_count)

        # Metric methods are queried in column order.
        silhouette = result.average_silhouette()
        same_organism = result.average_percent_same_taxonomy('organism')
        same_family = result.average_percent_same_taxonomy('family')
        same_genus = result.average_percent_same_taxonomy('genus')
        sensitivity, specificity = result.sensitivity_specificity('family')
        merge_distance = result.get_latest_merge_distance()

        row = (silhouette, same_organism, same_family, same_genus,
               sensitivity, specificity, merge_distance)
        for column, value in enumerate(row):
            metrics[cluster_count, column] = value

    return metrics
def test_distance_function(d,
                           tree_dir,
                           out_dir,
                           plot_distances=False,
                           plot_boxes=False):
    """Load the vlmcs in ``tree_dir`` and run ``test_distance_function_``.

    The trees in ``tree_dir`` are parsed to JSON and loaded as vlmcs. If a
    sibling directory named ``tree_dir + "_test"`` exists, it is parsed and
    loaded the same way and used as the test set; otherwise the vlmcs from
    ``tree_dir`` are used as the test set too.

    Args:
        d: Forwarded unchanged as the first argument of
            ``test_distance_function_`` (presumably the distance function
            under test -- confirm with the caller).
        tree_dir: Directory containing the trees to parse and load.
        out_dir: Output directory forwarded to ``test_distance_function_``.
            Created (including missing parents) if it does not exist.
            ``None`` skips directory creation.
        plot_distances: Forwarded to ``test_distance_function_``.
        plot_boxes: Forwarded to ``test_distance_function_``.

    Returns:
        Whatever ``test_distance_function_`` returns.

    Raises:
        OSError: If ``out_dir`` cannot be created.
    """
    parse_trees_to_json.parse_trees(tree_dir)
    vlmcs = VLMC.from_json_dir(tree_dir)

    metadata = get_metadata_for([vlmc.name for vlmc in vlmcs])

    test_dir = tree_dir + "_test"
    if os.path.isdir(test_dir):
        parse_trees_to_json.parse_trees(test_dir)
        test_vlmcs = VLMC.from_json_dir(test_dir)
    else:
        test_vlmcs = vlmcs

    if out_dir is not None:
        # Replaces a bare ``except:`` around ``os.stat`` that hid unrelated
        # errors (even KeyboardInterrupt); this form is also race-free.
        os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): the positional ``True, False`` flags are hard-coded here;
    # their meaning is defined by ``test_distance_function_``.
    return test_distance_function_(d, vlmcs, test_vlmcs, metadata, out_dir,
                                   True, False, plot_distances, plot_boxes)
Example #4
0
    parser.add_argument('--intersection', action='store_true')
    parser.add_argument('--occurrence-probability-labels', action='store_true')

    parser.add_argument(
        '--directory',
        type=str,
        default='../trees_pst_better',
        help='The directory which contains the vlmcs to be printed.')
    parser.add_argument(
        '--out-directory',
        type=str,
        default='../images',
        help='The directory to where the images should be written.')

    args = parser.parse_args()

    try:
        os.stat(args.out_directory)
    except:
        os.mkdir(args.out_directory)

    parse_trees(args.directory, args.deltas)
    vlmcs = VLMC.from_json_dir(args.directory)
    metadata = get_metadata_for([vlmc.name for vlmc in vlmcs])

    if args.intersection:
        save_intersection(vlmcs, metadata, args.out_directory)
    else:
        save(vlmcs, metadata, args.out_directory, args.deltas,
             args.occurrence_probability_labels)