Example #1
import types

# Note: `_check_cuml`, `nn_kneighbors_graph`, `nn_transform`, `nn_fit_transform`
# and `nn_predict` are helpers defined elsewhere in the same module.
def fast_knn(X,
             n_clusters=5,
             n_neighbors=None,
             graph_mode='distance',
             cluster_mode='spectral',
             algorithm='brute',
             n_jobs=1,
             random_state=1234,
             force_sklearn=False):
    r"""
  Arguments:
    X : `ndarray` or tuple of (X, y)
    n_clusters : int (default=5)
      Number of clusters used when predicting cluster labels from the
      neighbors graph.
    n_neighbors : int or None (default=None)
      The top K closest datapoints you want the algorithm to return.
      If None, this defaults to `n_clusters`.
      Currently, this value must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
      This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones and
          zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors according
          to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how to generate the cluster prediction from the
        neighbors graph:
        - 'dbscan' : run DBSCAN on the distance graph.
        - 'spectral' : run SpectralClustering on the connectivity matrix.
        - 'isomap' : cluster an Isomap embedding of the neighbors graph.
        - 'kmeans' : run KMeans with `n_clusters` clusters.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    force_sklearn = kwargs.pop('force_sklearn')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(force_sklearn)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors
        kwargs['n_gpus'] = kwargs['n_jobs']
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        from sklearn.neighbors import NearestNeighbors
    ## fitting
    knn = NearestNeighbors(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
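
A minimal usage sketch for `fast_knn` above. The call is hypothetical and assumes the module-level helpers (`_check_cuml`, `nn_kneighbors_graph`, `nn_predict`, etc.) are importable alongside it; as the docstring notes, `graph_mode='connectivity'` yields a 0/1 adjacency matrix while `graph_mode='distance'` yields metric distances, mirroring scikit-learn's `kneighbors_graph`.

import numpy as np

# Hypothetical usage; fast_knn and its helpers come from the surrounding module.
X = np.random.RandomState(0).randn(200, 16)

knn = fast_knn(X,
               n_clusters=4,
               graph_mode='distance',    # metric distances (suited to DBSCAN)
               cluster_mode='spectral',
               algorithm='brute')

G = knn.kneighbors_graph(X)     # k-neighbors graph, values per graph_mode
labels = knn.predict(X)         # one cluster label per sample
print(G.shape, np.unique(labels))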
Example #2
	def plot(self, feat_range, random_sampling, corpus_size):

		# Standard number of neighbors: the query itself plus its 3 nearest neighbors.
		# This cannot change unless the code below changes.
		n_nbrs = 4

		# Three neighbors per sample is argued to provide enough consensus.
		# Build a consensus of distance measures: use cosine, euclidean and manhattan
		# distance and combine them into a consensus tree (inspired by Eder).
		# Also search over ranges of features to make the visualization less biased.

		metric_dictionary = {'manhattan': 'manhattan', 'cosine': 'cosine', 'euclidean': 'euclidean'}

		authors, titles, texts = DataReader(self.folder_location, self.sample_size, {}, {}
										).metadata(sampling=True,
										type='folder',
										randomization=False)

		# Random stratified sampling:
		# each stratum receives a sampling fraction proportional to its number of samples.

		# corpus_size is given in thousands of words
		corpus_size = corpus_size * 1000

		if random_sampling == 'stratified':
			strata_proportions = {title.split('_')[0]: int(np.round(int(title.split('_')[-1]) / len(titles) * corpus_size / self.sample_size)) for title in titles}
			# print('::: corpus is being stratified to {} words in following proportions : '.format(str(corpus_size)))
			# print(strata_proportions, ' :::')
			strat_titles = []
			for stratum in strata_proportions:
				strata = [title for title in titles if stratum == title.split('_')[0]]
				sampling_fraction = strata_proportions[stratum]
				local_rand_strat_titles = random.sample(strata, sampling_fraction)
				strat_titles.append(local_rand_strat_titles)
			strat_titles = sum(strat_titles, [])
			strat_authors = [author for author, title in zip(authors, titles) if title in strat_titles]
			strat_texts = [text for title, text in zip(titles, texts) if title in strat_titles]
			titles = strat_titles
			authors = strat_authors
			texts = strat_texts

		fob_nodes = open(os.path.dirname(os.getcwd()) + "/gephi_nodes.txt", "w")
		fob_edges = open(os.path.dirname(os.getcwd()) + "/gephi_edges.txt", "w")

		fob_nodes.write("Id" + "\t" + "Work" + "\t" + "Author" + "\n")
		fob_edges.write("Source" + "\t" + "Target" + "\t" + "Type" + "\t" + "Weight" + "\n")

		# Build up consensus distances of different feature ranges and different metrics
		exhsearch_data = []
		for n_feats in feat_range:
			# print("::: running through feature range {} ::: ".format(str(n_feats)))
			tfidf_vectors, tfidf_features = Vectorizer(texts, self.invalid_words,
										  n_feats=n_feats,
										  feat_scaling='standard_scaler',
										  analyzer='word',
										  vocab=None
										  ).tfidf(smoothing=True)
			if n_feats == feat_range[-1]:
				pass
				# print("FEATURES: ", ", ".join(tfidf_features))
			for metric in metric_dictionary:
				model = NearestNeighbors(n_neighbors=n_nbrs,
										algorithm='brute',
										metric=metric_dictionary[metric],
										).fit(tfidf_vectors)
				distances, indices = model.kneighbors(tfidf_vectors)
				
				# Normalize distances so they are comparable across metrics and feature ranges
				all_distances = []
				for distance_vector in distances:
					for value in distance_vector:
						if value != 0.0:
							all_distances.append(value)

				all_distances = np.array(all_distances)
				# Inverted min-max scaling: the smallest distance maps to 1 and the largest
				# to 0, so closer neighbors later receive larger edge weights.
				min_dist = all_distances.min()
				max_dist = all_distances.max()
				normalized_distances = (distances - max_dist) / (min_dist - max_dist)
				
				# Record each node with its neighbors and normalized distances (one row per experiment)
				for distance_vec, index_vec in zip(normalized_distances, indices):
					data_tup = ('{} feats, {}'.format(str(n_feats), metric_dictionary[metric]),
								titles[index_vec[0]], 
								titles[index_vec[1]], distance_vec[1],
								titles[index_vec[2]], distance_vec[2],
								titles[index_vec[3]], distance_vec[3])
					exhsearch_data.append(data_tup)

		# Entire collected dataframe
		df = pd.DataFrame(exhsearch_data, columns=['exp', 'node', 'neighbor 1', 'dst 1', 'neighbor 2',
										 'dst 2', 'neighbor 3', 'dst 3']).sort_values(by='node', ascending=False)
		final_data = []
		weights = []
		node_orientation = {title: idx + 1 for idx, title in enumerate(titles)}
		for idx, (author, title) in enumerate(zip(authors, titles)):
			neighbors = []
			dsts = []
			# Pool all neighbors and distances together (ignore ranking of nb1, nb2, etc.)
			for num in range(1, n_nbrs):
				neighbors.append([neighb for neighb in df[df['node']==title]['neighbor {}'.format(str(num))]])
				dsts.append([neighb for neighb in df[df['node']==title]['dst {}'.format(str(num))]])
			neighbors = sum(neighbors, [])
			dsts = sum(dsts, [])

			# Token pattern so that hyphenated title names are not split up
			pattern = "(?u)\\b[\\w-]+\\b"
			model = CountVectorizer(lowercase=False, token_pattern=pattern)
			# Fit only to collect the vocabulary of neighbor titles; the counts are not used further
			model.fit_transform(neighbors)

			# Collect all candidates per sample that were chosen as nearest neighbor at least once
			candidate_dict = {neighbor: [] for neighbor in model.get_feature_names_out()}
			for nbr, dst in zip(neighbors, dsts):
				candidate_dict[nbr].append(dst)
			# Weight = mean normalized similarity * number of times chosen (i.e. the summed similarity)
			candidate_dict = {nbr: np.mean(candidate_dict[nbr]) * len(candidate_dict[nbr]) for nbr in candidate_dict}
			candidate_dict = sorted(candidate_dict.items(), key=lambda x: x[1], reverse=True)

			fob_nodes.write(str(idx + 1) + "\t" + str(title.split('_')[-1]) + "\t" + str(author) + "\n")
			data_tup = (title,)
			for candtitle, weight in candidate_dict[:8]:
				data_tup = data_tup + (candtitle, weight,)
				weights.append(weight)
				fob_edges.write(str(idx+1) + "\t" + str(node_orientation[candtitle]) + "\t" + "Undirected" + "\t" + str(weight) + "\n")
			final_data.append(data_tup)

		# Prepare column names for dataframe
		longest = int((len(final_data[np.argmax([len(i) for i in final_data])]) - 1) / 2)
		columns = sum([['neighbor {}'.format(str(i)), 'dst {}'.format(str(i))] for i in range(1, longest+1)], [])
		columns.insert(0, 'node')
		final_df = pd.DataFrame(final_data, columns=columns).sort_values(by='node', ascending=False)
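
For reference, the edge-weighting scheme used in the loop above can be sketched in isolation: pairwise distances are min-max normalized with the scale inverted (smallest distance maps to 1, largest to 0), and each candidate neighbor's consensus weight is its mean normalized score multiplied by how often it was chosen. A toy sketch with made-up values:

import numpy as np

# Toy kneighbors output: first column is the query itself (distance 0).
distances = np.array([[0.0, 0.2, 0.5],
                      [0.0, 0.1, 0.9]])

nonzero = distances[distances != 0.0]
min_d, max_d = nonzero.min(), nonzero.max()
# Inverted min-max scaling: nearest pair -> 1, farthest pair -> 0.
normalized = (distances - max_d) / (min_d - max_d)

# Consensus weight for a candidate chosen twice with scores 1.0 and 0.5:
scores = [1.0, 0.5]
weight = np.mean(scores) * len(scores)   # equals sum(scores) == 1.5
print(normalized.round(2), weight)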