import json

import numpy
from django.shortcuts import render
from elasticsearch import Elasticsearch

# Mapper, AverageLinkClustering, and JobInfo are project-local; their import
# paths are not shown in this snippet.


def author_clustering_result(request):
    es = Elasticsearch()
    if request.GET.get("index_id"):
        index_id = request.GET.get("index_id")
        authors = es.search(
            index=index_id,
            doc_type="author",
            body={"size": 10000, "query": {"match_all": {}}},
        )["hits"]["hits"]
        authors = {int(author.get("_id")): author.get("_source") for author in authors}

        # Map external author ids (uids) to dense sequential ids (sids) that
        # can index the distance matrix directly.
        mapper = Mapper()
        for author in authors:
            mapper.create_sid(author)
        N = mapper.size()

        distance = numpy.zeros(shape=(N, N))
        nodes = []
        edges = []
        for author_sid_1 in range(N):
            author_uid_1 = mapper.get_uid(author_sid_1)
            nodes.append(author_uid_1)
            for author_sid_2 in range(N):
                author_uid_2 = mapper.get_uid(author_sid_2)
                co_authored_publications = set(
                    authors[author_uid_1].get("publications")
                ).intersection(set(authors[author_uid_2].get("publications")))
                # Emit one edge per co-authored publication, once per pair.
                if author_sid_1 < author_sid_2:
                    for _ in co_authored_publications:
                        edges.append({"from": author_uid_1, "to": author_uid_2})
                # More co-authored publications -> smaller pairwise distance.
                distance[author_sid_1][author_sid_2] = 1.0 / (
                    1 + 5 * len(co_authored_publications)
                )

        avc = AverageLinkClustering(distance)
        # Heuristic target cluster count: 2 * N ** 0.6.
        clusters, cluster_map = avc.cluster(N ** 0.6 * 2)

        # Translate the clustering result back from sids to uids.
        uid_clusters = []
        uid_cluster_map = {}
        for cluster in clusters:
            uid_clusters.append({mapper.get_uid(x) for x in cluster})
        for sid in cluster_map:
            uid_cluster_map[mapper.get_uid(sid)] = cluster_map[sid]

        return render(
            request,
            "author_clustering_result.html",
            {
                "graph": {"nodes": nodes, "edges": edges},
                "cluster_map": uid_cluster_map,
                "clusters": uid_clusters,
            },
        )

    # No index selected yet: list the available indexes instead.
    indexes = es.indices.get_mapping()
    return render(request, "author_clustering_result.html", {"indexes": indexes})
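# The Mapper used in both functions is project-local and its implementation is
# not shown in this snippet. From its usage (create_sid, size, get_uid) it
# appears to be a bidirectional map between external document ids (uids) and
# dense matrix indices (sids). A minimal sketch consistent with that usage,
# an assumption rather than the project's actual implementation:
class MapperSketch:
    def __init__(self):
        self._uid_to_sid = {}
        self._sid_to_uid = []

    def create_sid(self, uid):
        # Assign the next dense sequential id to a previously unseen uid.
        if uid not in self._uid_to_sid:
            self._uid_to_sid[uid] = len(self._sid_to_uid)
            self._sid_to_uid.append(uid)
        return self._uid_to_sid[uid]

    def size(self):
        return len(self._sid_to_uid)

    def get_uid(self, sid):
        return self._sid_to_uid[sid]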
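# AverageLinkClustering is likewise project-local. Its interface suggests
# agglomerative clustering with average linkage over the precomputed distance
# matrix, cut to roughly 2 * N ** 0.6 clusters. As a point of reference only,
# a comparable result could be obtained with SciPy (a swapped-in library, not
# the project's implementation; the distance matrix above has a nonzero
# diagonal, hence checks=False):
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform


def average_link_clusters_sketch(distance, n_clusters):
    condensed = squareform(distance, checks=False)  # upper triangle only
    Z = linkage(condensed, method="average")
    labels = fcluster(Z, t=n_clusters, criterion="maxclust")
    cluster_map = {sid: int(label) for sid, label in enumerate(labels)}
    clusters = {}
    for sid, label in cluster_map.items():
        clusters.setdefault(label, set()).add(sid)
    return list(clusters.values()), cluster_map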
def calculate_pagerank_and_insert_to_elasticsearch(index_id, alpha, job_info_id):
    es = Elasticsearch()
    job_info = JobInfo.objects.get(id=job_info_id)
    job_info.info = json.dumps({'message': 'Fetching Publications From Elastic Joon ...', 'percentage': 7})
    job_info.save()

    publications = es.search(index=index_id, doc_type='publication',
                             body={"size": 10000, "query": {"match_all": {}}})['hits']['hits']
    publications = {int(publication.get('_id')): publication.get('_source') for publication in publications}

    # Map external publication ids (uids) to dense sequential ids (sids).
    mapper = Mapper()
    for publication in publications:
        mapper.create_sid(publication)
    N = mapper.size()
    links_graph = numpy.zeros(shape=(N, N))

    job_info.info = json.dumps({'message': 'Fetched Publications From Elastic Joon ...<br>'
                                           'Creating The Great Matrix ...', 'percentage': 35})
    job_info.save()

    for sid_1 in range(N):
        uid_1 = mapper.get_uid(sid_1)
        ones = 0
        for sid_2 in range(N):
            if sid_1 == sid_2:
                continue
            uid_2 = mapper.get_uid(sid_2)
            # Link sid_1 -> sid_2 if publication 1 references publication 2
            # (checked from both the references and the citations side).
            if uid_2 in publications[uid_1].get('references') or uid_1 in publications[uid_2].get('citations'):
                links_graph[sid_1][sid_2] = 1
                ones += 1
        if ones == 0:
            # Dangling node with no outgoing links: distribute rank uniformly.
            for sid_2 in range(N):
                links_graph[sid_1][sid_2] = 1. / N
        else:
            # Row-normalize and mix in the teleport term; alpha is the
            # teleport probability, so each row still sums to 1.
            for sid_2 in range(N):
                links_graph[sid_1][sid_2] = links_graph[sid_1][sid_2] / ones * (1 - alpha) + alpha / N

    job_info.info = json.dumps({'message': 'Multiplying ...', 'percentage': 60})
    job_info.save()

    # Power iteration: start from the uniform distribution and repeatedly
    # multiply by the row-stochastic transition matrix, renormalizing to
    # guard against floating-point drift.
    probability_vector = numpy.full((1, N), 1. / N)
    for _ in range(365):
        probability_vector = probability_vector @ links_graph
        probability_vector = probability_vector / probability_vector.sum()

    job_info.info = json.dumps({'message': 'Writing to ElasticSearch ...', 'percentage': 80})
    job_info.save()

    # Scale scores so the top-ranked publication gets PR = 1.
    max_PR = probability_vector.max()
    for sid in range(N):
        uid = mapper.get_uid(sid)
        PR = probability_vector[0, sid] / max_PR
        es.update(index=index_id, doc_type='publication', id=str(uid), body={"doc": {'PR': PR}})
        es.update(index='global-index', doc_type='publication', id=str(uid), body={"doc": {'PR': PR}})

    job_info.info = json.dumps({'message': 'Finito ...', 'percentage': 100})
    job_info.save()
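# As a quick sanity check of the matrix construction and power iteration
# above, a standalone sketch on a tiny citation graph. The function name and
# the example graph are illustrative assumptions, not from the source.
def pagerank_sketch(out_links, alpha=0.15, iterations=100):
    N = len(out_links)
    G = numpy.zeros((N, N))
    for i, targets in enumerate(out_links):
        if not targets:
            G[i, :] = 1.0 / N  # dangling node: distribute rank uniformly
            continue
        for j in targets:
            G[i, j] = 1.0 / len(targets)
        # Mix in the teleport term, matching the row normalization above.
        G[i, :] = G[i, :] * (1 - alpha) + alpha / N
    x = numpy.full((1, N), 1.0 / N)
    for _ in range(iterations):
        x = x @ G
        x = x / x.sum()
    return x[0]


# Example: node 0 cites 1 and 2, node 1 cites 2, node 2 is dangling;
# pagerank_sketch([{1, 2}, {2}, set()]) ranks node 2 highest.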