Example #1
# Assumed imports for this snippet: numpy, the elasticsearch client, and
# Django's render; Mapper and AverageLinkClustering are project-local helpers
# (a minimal sketch of Mapper follows the example).
import numpy
from django.shortcuts import render
from elasticsearch import Elasticsearch


def author_clustering_result(request):
    es = Elasticsearch()

    index_id = request.GET.get("index_id")
    if index_id:

        authors = es.search(index=index_id, doc_type="author",
                            body={"size": 10000, "query": {"match_all": {}}})["hits"]["hits"]
        # Keyed by integer document id (uid) -> _source.
        authors = {int(author.get("_id")): author.get("_source") for author in authors}

        mapper = Mapper()
        # Give each author uid a dense sequential sid for matrix indexing.
        for author_uid in authors:
            mapper.create_sid(author_uid)

        N = mapper.size()
        distance = numpy.zeros(shape=(N, N))

        nodes = []
        edges = []

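        # O(N^2) pairwise pass: record one graph edge per co-authored
        # publication, and fill a symmetric distance matrix where more shared
        # publications mean a smaller distance (1.0 with none, approaching 0).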
        for author_sid_1 in range(N):
            author_uid_1 = mapper.get_uid(author_sid_1)
            nodes.append(author_uid_1)
            for author_sid_2 in range(N):
                author_uid_2 = mapper.get_uid(author_sid_2)
                co_authored_publications = set(authors[author_uid_1].get("publications") or []).intersection(
                    authors[author_uid_2].get("publications") or []
                )
                if author_sid_1 < author_sid_2:
                    # One parallel edge per co-authored publication.
                    for _ in co_authored_publications:
                        edges.append({"from": author_uid_1, "to": author_uid_2})
                distance[author_sid_1][author_sid_2] = 1.0 / (1 + 5 * len(co_authored_publications))

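        # Agglomerative average-linkage clustering over the distance matrix;
        # the 2 * N**0.6 argument is presumably a heuristic for the target
        # number of clusters (AverageLinkClustering is project-local).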
        avc = AverageLinkClustering(distance)
        clusters, cluster_map = avc.cluster(N ** 0.6 * 2)

        uid_clusters = []
        uid_cluster_map = {}
        for cluster in clusters:
            uid_clusters.append({mapper.get_uid(x) for x in cluster})
        for sid, cluster_id in cluster_map.items():
            uid_cluster_map[mapper.get_uid(sid)] = cluster_id

        return render(
            request,
            "author_clustering_result.html",
            {"graph": {"nodes": nodes, "edges": edges}, "cluster_map": uid_cluster_map, "clusters": uid_clusters},
        )

    # No index chosen yet: list the available indexes for selection.
    indexes = es.indices.get_mapping()
    return render(request, "author_clustering_result.html", {"indexes": indexes})
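
Both examples rely on a project-local Mapper that assigns dense sequential ids ("sids") to the sparse Elasticsearch document ids ("uids") so they can index a NumPy matrix. Its implementation is not shown; a minimal sketch consistent with the calls used above (create_sid, size, get_uid) could look like this:

class Mapper:
    # Bidirectional uid <-> sid map (sketch only; the real class is project-local).

    def __init__(self):
        self._uid_to_sid = {}
        self._sid_to_uid = []

    def create_sid(self, uid):
        # Assign the next sequential sid to a previously unseen uid.
        if uid not in self._uid_to_sid:
            self._uid_to_sid[uid] = len(self._sid_to_uid)
            self._sid_to_uid.append(uid)
        return self._uid_to_sid[uid]

    def size(self):
        return len(self._sid_to_uid)

    def get_uid(self, sid):
        return self._sid_to_uid[sid]
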
Example #2
# Assumed imports for this snippet: json, numpy, and the elasticsearch client;
# JobInfo is the project-local Django model used for progress reporting.
import json

import numpy
from elasticsearch import Elasticsearch


def calculate_pagerank_and_insert_to_elasticsearch(index_id, alpha, job_info_id):
    es = Elasticsearch()

    job_info = JobInfo.objects.get(id=job_info_id)

    job_info.info = json.dumps({'message': 'Fetching publications from Elasticsearch ...', 'percentage': 7})
    job_info.save()

    publications = es.search(index=index_id, doc_type='publication',
                             body={"size": 10000, "query": {"match_all": {}}})['hits']['hits']
    publications = {int(publication.get('_id')): publication.get('_source') for publication in publications}

    mapper = Mapper()
    # Give each publication uid a dense sequential sid for matrix indexing.
    for publication_uid in publications:
        mapper.create_sid(publication_uid)

    N = mapper.size()
    links_graph = numpy.zeros(shape=(N, N))

    job_info.info = json.dumps({'message': 'Fetched publications from Elasticsearch ...<br>'
                                           'Building the link matrix ...', 'percentage': 35})
    job_info.save()

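    # Build a damped, row-stochastic transition matrix: each row with outgoing
    # links is normalized and mixed with a uniform alpha / N teleport term;
    # dangling rows (no outgoing links) fall back to the uniform 1 / N row.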
    for sid_1 in range(N):
        uid_1 = mapper.get_uid(sid_1)
        out_links = 0
        for sid_2 in range(N):
            if sid_1 == sid_2:
                continue
            uid_2 = mapper.get_uid(sid_2)
            if uid_2 in (publications[uid_1].get('references') or []) \
                    or uid_1 in (publications[uid_2].get('citations') or []):
                links_graph[sid_1][sid_2] = 1
                out_links += 1
        if out_links == 0:
            for sid_2 in range(N):
                links_graph[sid_1][sid_2] = 1. / N
        else:
            for sid_2 in range(N):
                links_graph[sid_1][sid_2] = links_graph[sid_1][sid_2] / out_links * (1 - alpha) + alpha / N

    job_info.info = json.dumps({'message': 'Running power iteration ...', 'percentage': 60})
    job_info.save()

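    # Power iteration: repeatedly multiply the probability row vector by the
    # transition matrix and renormalize; a fixed iteration count stands in
    # for a convergence test.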
    probability_vector = numpy.full((1, N), 1. / N)

    for _ in range(365):
        probability_vector = probability_vector @ links_graph
        probability_vector = probability_vector / numpy.sum(probability_vector)

    job_info.info = json.dumps({'message': 'Writing to ElasticSearch ...', 'percentage': 80})
    job_info.save()

    max_PR = probability_vector.max()

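    # Scale so the top-ranked publication gets PR = 1.0, then write the score
    # back to both the per-index and the global Elasticsearch index.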
    for sid in range(N):
        uid = mapper.get_uid(sid)
        PR = probability_vector[0, sid] / max_PR
        es.update(index=index_id, doc_type='publication', id=str(uid), body={"doc": {'PR': PR}})
        es.update(index='global-index', doc_type='publication', id=str(uid), body={"doc": {'PR': PR}})

    job_info.info = json.dumps({'message': 'Finished.', 'percentage': 100})
    job_info.save()
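
For reference, here is the same damped power-iteration scheme in isolation as a self-contained NumPy sketch. The pagerank function and the toy 3-node citation graph are illustrative, not part of the project:

import numpy

def pagerank(adjacency, alpha=0.15, iterations=100):
    # Toy PageRank over a 0/1 adjacency matrix (row i links to column j),
    # mirroring the construction above: normalized rows mixed with a uniform
    # alpha / n teleport term, dangling rows replaced by the uniform row.
    n = adjacency.shape[0]
    transition = numpy.full((n, n), 1.0 / n)
    row_sums = adjacency.sum(axis=1)
    linked = row_sums > 0
    transition[linked] = (adjacency[linked] / row_sums[linked][:, None]) * (1 - alpha) + alpha / n
    x = numpy.full(n, 1.0 / n)
    for _ in range(iterations):
        x = x @ transition
        x = x / x.sum()
    return x / x.max()  # top node scores 1.0, as in the task above

# Node 0 is cited by nodes 1 and 2, so it should rank highest.
adj = numpy.array([[0., 0., 0.],
                   [1., 0., 0.],
                   [1., 0., 0.]])
print(pagerank(adj))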