コード例 #1
0
def cluster_alchemy(dataset, gamma=None, filter=False):
    """Cluster ``dataset`` with ``hr.average`` and report the f-scores.

    The tf-idf matrix, the f-score reference dict and the ``params`` dict are
    all produced by ``DocumentsProcessor.get_data_with_alchemy``.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param gamma: when truthy it is forwarded, together with ``filter``, to
        ``get_data_with_alchemy``; when falsy (``None``, ``0``) the processor
        is called without arguments.
    :param filter: only forwarded when ``gamma`` is truthy.
    :return: the ``params`` dict, extended with the ``'avg_f_score'`` and
        ``'all_fscore'`` entries.
    """
    processor = dp.DocumentsProcessor(dataset)

    # An empty kwargs dict is the same as calling with no arguments, so the
    # processor's own defaults apply whenever gamma is not set.
    options = {'gamma': gamma, 'filter': filter} if gamma else {}
    tfidf_matrix, f_score_dict, params = processor.get_data_with_alchemy(
        **options)

    n_docs = tfidf_matrix.shape[0]
    n_features = tfidf_matrix.shape[1]
    # Single parenthesised argument: prints the same text under Python 2 and
    # also parses under Python 3.
    print('starting clustering: found %s document and %s features'
          % (n_docs, n_features))

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(scores)

    params['avg_f_score'] = average_f_score(scores, n_docs)
    params['all_fscore'] = all_scores

    print('average f_score: %s' % params['avg_f_score'])
    return params
コード例 #2
0
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Run the dandelion clustering pipeline and return the linkage matrix.

    Duplicate of the dandelion clustering routine, kept only because the
    caller needs the linkage matrix back. The f-score bookkeeping is still
    executed (``print_f_score_dict`` is called and ``params`` is updated),
    but ``params`` is not returned.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param gamma: when truthy it is forwarded, together with ``filter``, to
        ``get_data_with_dandelion``; when falsy the processor is called
        without arguments.
    :param filter: only forwarded when ``gamma`` is truthy.
    :return: the value returned by ``hr.average`` on the LSA-reduced matrix.
    """
    processor = dp.DocumentsProcessor(dataset)

    # An empty kwargs dict is the same as calling with no arguments.
    options = {'gamma': gamma, 'filter': filter} if gamma else {}
    tfidf_matrix, f_score_dict, params = processor.get_data_with_dandelion(
        **options)

    # LSA step: truncated SVD with as many components as there are rows,
    # followed by in-place normalisation.
    n_components = tfidf_matrix.shape[0]
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    # The reduced matrix is handed to hr.average as-is (no .toarray() call).
    linkage_matrix = hr.average(tfidf_matrix)

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(scores)

    params['avg_f_score'] = average_f_score(scores, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores

    return linkage_matrix
コード例 #3
0
def scipy_algo(dataset, abstract=False):
    """Cluster ``dataset`` with ``hr.average`` and return the average f-score.

    The tf-idf matrix is clustered directly: no LSA reduction is applied.
    The previous version built a ``TruncatedSVD``/``Normalizer`` pipeline but
    never used it; that dead setup has been removed.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param abstract: forwarded unchanged to ``DocumentsProcessor.get_data``.
    :return: the value of ``average_f_score`` for the computed clusters.
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)

    n_docs, n_features = tfidf_matrix.shape[0], tfidf_matrix.shape[1]

    # NOTE(review): the message mentions LSA although no LSA step runs here;
    # the text is kept unchanged because it is part of the program output.
    print('starting clustering after lsa: found %s document and %s features'
          % (n_docs, n_features))

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if not node.is_leaf():
            clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)

    print_f_score_dict(f)

    avg_f_score = average_f_score(f, n_docs)
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
コード例 #4
0
def cluster_dandelion_entities(dataset, gamma=None, filter=False):
    """Cluster ``dataset`` using entity-only features and report f-scores.

    Data comes from ``DocumentsProcessor.get_data_only_with_entities``; the
    tf-idf matrix is reduced with an LSA pipeline before ``hr.average`` is
    applied.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param gamma: when truthy it is forwarded, together with ``filter``, to
        ``get_data_only_with_entities``; when falsy the processor is called
        without arguments.
    :param filter: only forwarded when ``gamma`` is truthy.
    :return: the ``params`` dict, extended with the ``'avg_f_score'`` and
        ``'all_fscore'`` entries.
    """
    processor = dp.DocumentsProcessor(dataset)

    # An empty kwargs dict is the same as calling with no arguments.
    options = {'gamma': gamma, 'filter': filter} if gamma else {}
    tfidf_matrix, f_score_dict, params = \
        processor.get_data_only_with_entities(**options)

    doc, features = tfidf_matrix.shape
    # Single parenthesised argument: prints the same text under Python 2 and
    # also parses under Python 3.
    print('starting clustering: found %s document and %s features'
          % (doc, features))

    # LSA step: truncated SVD with as many components as there are rows,
    # followed by in-place normalisation.
    lsa = make_pipeline(TruncatedSVD(tfidf_matrix.shape[0]),
                        Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    reduced_shape = tfidf_matrix.shape
    print('starting clustering: found %s document and %s features after LSA'
          % (reduced_shape[0], reduced_shape[1]))

    # The reduced matrix is handed to hr.average as-is (no .toarray() call).
    linkage_matrix = hr.average(tfidf_matrix)

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(scores)

    params['avg_f_score'] = average_f_score(scores, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores

    print('average f_score: %s' % params['avg_f_score'])
    return params
コード例 #5
0
def scipy_algo(dataset):
    """Cluster ``dataset`` with ``hr.average`` and print the average f-score.

    Nothing is returned; the per-cluster scores are reported through
    ``print_f_score_dict`` and the average is printed.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    """
    processor = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = processor.get_data()

    dense_tfidf = tfidf_matrix.toarray()
    linkage_matrix = hr.average(dense_tfidf)

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    print_f_score_dict(scores)

    average = average_f_score(scores, tfidf_matrix.shape[0])
    # Single parenthesised argument: prints the same text under Python 2 and
    # also parses under Python 3.
    print('average f_score: %s' % average)
コード例 #6
0
def cluster_fabio(db, dataset, gamma=None, with_lsa=False, ranking_metric='r'):
    """Cluster ``dataset`` with data from ``get_data_fabio`` and report f-scores.

    :param db: passed to ``dp.DocumentsProcessor`` as the ``db`` keyword.
    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param gamma: forwarded to ``get_data_fabio`` only when truthy.
    :param with_lsa: when truthy the tf-idf matrix is reduced with an LSA
        pipeline before clustering; otherwise it is converted with
        ``.toarray()`` and clustered directly.
    :param ranking_metric: always forwarded to ``get_data_fabio`` as
        ``rank_metric``.
    :return: the ``params`` dict, extended with the ``'avg_f_score'`` and
        ``'all_fscore'`` entries.
    """
    processor = dp.DocumentsProcessor(dataset, db=db)

    # rank_metric is always sent; gamma is added only when it is set so the
    # processor's own default applies otherwise.
    fetch_kwargs = {'rank_metric': ranking_metric}
    if gamma:
        fetch_kwargs['gamma'] = gamma
    tfidf_matrix, f_score_dict, params = processor.get_data_fabio(
        **fetch_kwargs)

    doc, features = tfidf_matrix.shape
    # Single parenthesised argument: prints the same text under Python 2 and
    # also parses under Python 3.
    print('starting clustering: found %s document and %s features'
          % (doc, features))

    if not with_lsa:
        linkage_matrix = hr.average(tfidf_matrix.toarray())
    else:
        # LSA step: truncated SVD with as many components as there are rows,
        # followed by in-place normalisation.
        lsa = make_pipeline(TruncatedSVD(tfidf_matrix.shape[0]),
                            Normalizer(copy=False))
        tfidf_matrix = lsa.fit_transform(tfidf_matrix)
        linkage_matrix = hr.average(tfidf_matrix)

    # With rd=True the second item of the result is the list of tree nodes.
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    # Map every internal (non-leaf) node id to whatever collect_leaf_nodes
    # returns for it; each call gets its own fresh accumulator list.
    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(scores)

    params['avg_f_score'] = average_f_score(scores, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores

    print('average f_score: %s' % params['avg_f_score'])
    return params
コード例 #7
0
def scipy_algo_cosine(dataset, abstract=False):
    '''
    Diagnostic run of average-linkage clustering with the cosine metric.

    At the moment clustering with cosine similarity does not work, because
    there seems to be a scipy bug that returns a negative distance:

    https://github.com/scipy/scipy/issues/5208

    The function therefore only prints the shape of the linkage matrix and
    the shape of its negative entries, then returns ``None``. The f-score
    evaluation after the early ``return`` is intentionally unreachable.

    :param dataset: forwarded unchanged to ``dp.DocumentsProcessor``.
    :param abstract: forwarded unchanged to ``DocumentsProcessor.get_data``.
    :return: ``None``
    '''
    processor = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = processor.get_data(abstract)

    dense_tfidf = tfidf_matrix.todense()
    linkage_matrix = hr.linkage(dense_tfidf, method='average', metric='cosine')

    # Diagnostics: overall shape, then how many entries are negative.
    print(linkage_matrix.shape)
    negative_entries = linkage_matrix[linkage_matrix < 0]
    print(negative_entries.shape)
    return

    # --- Unreachable on purpose: parked until the negative-distance problem
    # described in the docstring is resolved. ---
    nodes = hr.to_tree(linkage_matrix, rd=True)[1]

    clusters = {}
    for node in nodes:
        if node.is_leaf():
            continue
        leaves = collect_leaf_nodes(node, [])
        clusters[node.get_id()] = leaves

    scores = f_score(clusters, f_score_dict)
    print_f_score_dict(scores)

    print('average f_score: %s' % average_f_score(scores, tfidf_matrix.shape[0]))