def cluster_docs():
    """Run the document clustering job and render its result page.

    Instantiates ``K_means``, calls its ``clusterDocs()`` method while a
    ``Timer`` is running, and renders ``clustering_result.html`` with the
    formatted elapsed time and the number of entries in the clusterer's
    ``centroidList``.

    Returns:
        Whatever ``render_template`` returns for ``clustering_result.html``.
    """
    stopwatch = Timer()
    stopwatch.start()

    # Function-scope import: it executes after the timer has started, so any
    # first-call import cost is part of the reported duration.
    from clustering.K_means import K_means

    clusterer = K_means()
    clusterer.clusterDocs()
    stopwatch.end()

    elapsed = stopwatch.get_time_taken_pretty()
    centroid_count = len(clusterer.centroidList)
    return render_template(
        'clustering_result.html',
        duration=elapsed,
        numclusters=centroid_count,
    )
def page_rank():
    """Run the PageRank job and render its result page.

    Instantiates ``PageRank``, calls its ``pageRank()`` method while a
    ``Timer`` is running, and renders ``pagerank_result.html`` with the
    formatted elapsed time.

    Returns:
        Whatever ``render_template`` returns for ``pagerank_result.html``.
    """
    stopwatch = Timer()
    stopwatch.start()

    # Function-scope import: it executes after the timer has started, so any
    # first-call import cost is part of the reported duration.
    from pageRank.PageRank import PageRank

    ranker = PageRank()
    ranker.pageRank()
    stopwatch.end()

    elapsed = stopwatch.get_time_taken_pretty()
    return render_template('pagerank_result.html', duration=elapsed)
def _generic_index(retrieved_path):
    """Bulk-add the documents under ``retrieved_path`` and render the outcome.

    Builds an ``IndexingAPI`` for ``ELASTIC_URL``, asks it to bulk-add every
    document in ``retrieved_path`` to ``INDEX_NAME`` / ``DOCUMENT_TYPE``, and
    renders ``indexing_result.html`` with the decoded reply.

    Args:
        retrieved_path: Directory handed both to the ``IndexingAPI``
            constructor and to ``bulk_add_documents_in_directory``.

    Returns:
        Whatever ``render_template`` returns for ``indexing_result.html``.

    Note:
        The decoded reply is subscripted with ``'errors'`` and ``'items'``;
        this assumes ``.json()`` yields a mapping carrying both keys
        (presumably an Elasticsearch bulk response -- TODO confirm).
    """
    stopwatch = Timer()
    stopwatch.start()

    indexer = IndexingAPI(ELASTIC_URL, retrieved_path)
    reply = indexer.bulk_add_documents_in_directory(
        retrieved_path, INDEX_NAME, DOCUMENT_TYPE
    )
    payload = reply.json()

    # Success is reported when the reply's 'errors' value is falsy.
    succeeded = not payload['errors']
    indexed_count = len(payload['items'])
    # indent=True is kept as-is; the encoder treats it as an indent width of 1.
    formatted_payload = json.dumps(payload, indent=True)
    stopwatch.end()

    elapsed = stopwatch.get_time_taken_pretty()
    return render_template(
        'indexing_result.html',
        duration=elapsed,
        elastic_response=formatted_payload,
        success=succeeded,
        numdocs=indexed_count,
    )
def author_cluster_admin():
    """Cluster authors, persist the membership map, and render the outcome.

    Steps, all performed while a ``Timer`` is running:

    1. Load every ``*.json`` file that ``list_files`` reports for
       ``AUTHOR_CLUSTER_SOURCE_DIRECTORY`` and wrap each decoded payload in
       an ``Author``.
    2. Build a ``Dendogram`` from those authors, call ``cluster()``, and ask
       for the clusters at a minimum similarity of 0.375.
    3. Reduce each cluster to the list of its authors' ``name`` values.
    4. Write a JSON object to ``AUTHOR_CLUSTER_FILE`` mapping every author
       name to the name list of the cluster it belongs to.

    Returns:
        Whatever ``render_template`` returns for ``indexing_result.html``
        (the indexing template is reused here, with ``success`` fixed to
        ``True`` and ``numdocs`` set to the number of clusters).
    """
    stopwatch = Timer()
    stopwatch.start()

    authors = []
    for filename in list_files(AUTHOR_CLUSTER_SOURCE_DIRECTORY, '*.json'):
        source_path = os.path.join(AUTHOR_CLUSTER_SOURCE_DIRECTORY, filename)
        with open(source_path, 'r') as source:
            payload = json.load(source)
        authors.append(Author(payload))

    # Function-scope import, executed inside the timed region.
    from clustering.authors_cluster import Dendogram

    dendogram = Dendogram(authors)
    dendogram.cluster()

    min_similarity = 0.375
    cluster_list = []
    for group in dendogram.get_clusters(min_similarity):
        members = list(group.authors)
        cluster_list.append([member.name for member in members])

    # Every name points at the full name list of its cluster; if a name shows
    # up in more than one cluster, the later cluster wins.
    cluster_dict = {
        name: names
        for names in cluster_list
        for name in names
    }

    with open(AUTHOR_CLUSTER_FILE, 'w') as sink:
        json.dump(cluster_dict, sink)
    stopwatch.end()

    elapsed = stopwatch.get_time_taken_pretty()
    # indent=True is kept as-is; the encoder treats it as an indent width of 1.
    formatted_clusters = json.dumps(cluster_list, indent=True)
    return render_template(
        'indexing_result.html',
        duration=elapsed,
        elastic_response=formatted_clusters,
        success=True,
        numdocs=len(cluster_list),
    )