Esempio n. 1
0
        def mch(font_names):
            """Recursively split ``font_names`` into clusters with ``networkx_mcl``.

            A graph is built whose edges join every pair of fonts whose stored
            distance is below a threshold.  The threshold is taken at the 40th,
            20th, 10th, 5th and finally 2.5th percentile of the pairwise
            distances, stopping as soon as the clustering no longer has exactly
            one entry.

            Args:
                font_names: names used as keys into ``self.data``; each
                    ``self.data[name]["distance_to"]`` is indexed by the other
                    names.

            Returns:
                The clustering returned by ``networkx_mcl`` unchanged when it
                holds a single entry; otherwise a list with one item per
                element of that clustering, where any element with more than
                5 members is replaced by the result of recursing on it.
            """
            # Single-argument form: prints the same text under Python 2 and is
            # valid syntax under Python 3 (the bare print statement is not).
            print("mch of %d %s" % (len(font_names), font_names))

            # Every ordered pair of distinct fonts contributes one distance.
            distances = numpy.array([
                self.data[i]["distance_to"][j]
                for i in font_names
                for j in font_names
                if i != j
            ])

            def get_graph(et):
                """Build the graph for edge threshold ``et``.

                All fonts are nodes; a pair is linked when its distance is
                strictly below ``et``.
                """
                g = networkx.Graph()
                g.add_nodes_from(font_names)
                # ``i < j`` keeps one orientation of each unordered pair.
                edges = [
                    (i, j)
                    for i in font_names
                    for j in font_names
                    if i < j and self.data[i]["distance_to"][j] < et
                ]
                g.add_edges_from(edges)
                return g

            solution = [font_names]  # start with one cluster and try to break it apart
            quantile = 80.0          # halved before first use, so the first try is 40
            while 2.5 < quantile and 1 == len(solution):
                quantile = quantile * 0.5
                threshold = numpy.percentile(distances, quantile)
                g = get_graph(threshold)
                # Default MCL parameters are used; the first returned value is
                # not needed here.
                _, solution = networkx_mcl(g)

            if 1 == len(solution.keys()):
                return solution  # nothing more we can do

            # If a cluster has more than 5 members, recurse down.
            # NOTE(review): ``solution`` is treated as a mapping just above
            # (``.keys()``) but iterated directly here, which for a dict yields
            # its keys rather than the clusters -- confirm what
            # ``networkx_mcl`` returns and whether ``.values()`` was intended.
            return [c if 5 >= len(c) else mch(c) for c in solution]
def mcl_cluster(G):
    """Run MCL clustering on graph ``G`` and pretty-print the outcome.

    Prints the matrix returned by ``mcl_clustering.networkx_mcl`` followed by
    a one-line summary of how many clusters were found.  Returns ``None``.
    """
    matrix, found = mcl_clustering.networkx_mcl(G)

    # The result of this call is not used; the call itself is kept as-is.
    # NOTE(review): confirm whether ``get_clusters`` is needed here at all.
    get_clusters(matrix)

    pp.pprint(matrix)
    summary = "Found {} clusters.".format(len(found))
    pp.pprint(summary)
Esempio n. 3
0
    required=True,
    type=int)
# Optional boolean switch; per its help text it requests a database clean
# before the script runs.  Absent -> False, present -> True.
parser.add_argument(
    '-c', '--clean',
    action='store_true',
    default=False,
    required=False,
    help='clean database before executing script')

# Parser-level default for the ``verbose`` attribute.
parser.set_defaults(verbose=False)

if __name__ == '__main__':
    args = parser.parse_args()
    client, G, tagsToNotes = load_graph(**vars(args))
    ideasCollection = client.notes.ideas

    M, clusters = networkx_mcl(G)
    print("Found {} clusters".format(len(clusters)))
    for clique in clusters:
        hashed = hash(frozenset(clique + [
            args.weight,
        ]))

        correspondingNotes = list(
            set([note['_id'] for tag in clique for note in tagsToNotes[tag]]))
        idea = {
            'name': None,
            'tags': clique,
            'size': len(clique),
            'notes_ids': correspondingNotes,
            'notes_ids_size': len(correspondingNotes),
            'algorithm': 'mcl',