Example #1
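The method below relies on these module-level imports. The DataReader and Vectorizer helpers are project-specific, so the last import path is an assumption for illustration, not part of the original code.

import os
import random

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer

# Project-local helpers; this module path is assumed, not taken from the original code
from preprocessing import DataReader, Vectorizer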
	def plot(self, feat_range, random_sampling, corpus_size):

		# Number of neighbors returned per sample: the sample itself plus its three nearest neighbors.
		# The code below assumes exactly this value, so it should not be changed on its own.
		n_nbrs = 4

		# Three nearest neighbors per sample are taken to provide enough consensus.
		# Build a consensus over distance measures: cosine, euclidean and manhattan
		# distances are combined into a consensus tree (inspired by Eder).
		# Also search over a range of feature counts to make the visualization less biased.

		metric_dictionary = {'manhattan': 'manhattan', 'cosine': 'cosine', 'euclidean': 'euclidean'}

		authors, titles, texts = DataReader(self.folder_location, self.sample_size, {}, {}
										).metadata(sampling=True,
										type='folder',
										randomization=False)

		# Random stratified sampling:
		# each stratum receives a number of samples proportional to its share of the corpus
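		# corpus_size is given in thousands of words and converted to an absolute word count below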

		corpus_size = corpus_size*1000

		if random_sampling == 'stratified':
			strata_proportions = {title.split('_')[0]: int(np.round(int(title.split('_')[-1]) / len(titles) * corpus_size / self.sample_size)) for title in titles}
			# print('::: corpus is being stratified to {} words in following proportions : '.format(str(corpus_size)))
			# print(strata_proportions, ' :::')
			strat_titles = []
			for stratum in strata_proportions:
				strata = [title for title in titles if stratum == title.split('_')[0]]
				sampling_fraction = strata_proportions[stratum]
				local_rand_strat_titles = random.sample(strata, sampling_fraction)
				strat_titles.append(local_rand_strat_titles)
			strat_titles = sum(strat_titles, [])
			strat_authors = [author for author, title in zip(authors, titles) if title in strat_titles]
			strat_texts = [text for title, text in zip(titles, texts) if title in strat_titles]
			titles = strat_titles
			authors = strat_authors
			texts = strat_texts

		fob_nodes = open(os.path.dirname(os.getcwd()) + "/gephi_nodes.txt", "w")
		fob_edges = open(os.path.dirname(os.getcwd()) + "/gephi_edges.txt", "w")
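		# Node and edge tables are written as tab-separated files that can be imported into Gephi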

		fob_nodes.write("Id" + "\t" + "Work" + "\t" + "Author" + "\n")
		fob_edges.write("Source" + "\t" + "Target" + "\t" + "Type" + "\t" + "Weight" + "\n")

		# Build up consensus distances across the feature range and the different distance metrics
		exhsearch_data = []
		for n_feats in feat_range:
			# print("::: running through feature range {} ::: ".format(str(n_feats)))
			tfidf_vectors, tfidf_features = Vectorizer(texts, self.invalid_words,
										  n_feats=n_feats,
										  feat_scaling='standard_scaler',
										  analyzer='word',
										  vocab=None
										  ).tfidf(smoothing=True)
			if n_feats == feat_range[-1]:
				pass
				# print("FEATURES: ", ", ".join(tfidf_features))
			for metric in metric_dictionary:
				model = NearestNeighbors(n_neighbors=n_nbrs,
										algorithm='brute',
										metric=metric_dictionary[metric],
										).fit(tfidf_vectors)
				distances, indices = model.kneighbors(tfidf_vectors)
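				# kneighbors() returns, per sample, the distances to and indices of its n_nbrs
				# nearest neighbors; the first entry is the sample itself at distance 0.0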
				
				# Normalize the distances so that the different metrics are on a comparable scale,
				# then invert them so that a higher value means a closer (more similar) neighbor
				all_distances = []
				for distance_vector in distances:
					for value in distance_vector:
						if value != 0.0:
							all_distances.append(value)

				all_distances = np.array(all_distances)
				lowest_value = all_distances.min()
				highest_value = all_distances.max()
				normalized_distances = 1 - (distances - lowest_value) / (highest_value - lowest_value)
				
				# Collect one row per sample: its three nearest neighbors with their normalized similarities
				for distance_vec, index_vec in zip(normalized_distances, indices):
					data_tup = ('{} feats, {}'.format(str(n_feats), metric_dictionary[metric]),
								titles[index_vec[0]], 
								titles[index_vec[1]], distance_vec[1],
								titles[index_vec[2]], distance_vec[2],
								titles[index_vec[3]], distance_vec[3])
					exhsearch_data.append(data_tup)

		# Collect all experiments into a single dataframe
		df = pd.DataFrame(exhsearch_data, columns=['exp', 'node', 'neighbor 1', 'dst 1', 'neighbor 2',
										 'dst 2', 'neighbor 3', 'dst 3']).sort_values(by='node', ascending=False)
		final_data = []
		weights = []
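		# Map each title to a 1-based node id that matches the ids written to gephi_nodes.txt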
		node_orientation = {title: idx+1 for idx, title in enumerate(titles)}
		for idx, (author, title) in enumerate(zip(authors, titles)):
			neighbors = []
			dsts = []
			# Pool all neighbors and distances together (ignore ranking of nb1, nb2, etc.)
			for num in range(1, n_nbrs):
				neighbors.append([neighb for neighb in df[df['node']==title]['neighbor {}'.format(str(num))]])
				dsts.append([neighb for neighb in df[df['node']==title]['dst {}'.format(str(num))]])
			neighbors = sum(neighbors, [])
			dsts = sum(dsts, [])

			# Token pattern that keeps hyphenated title names from being split into separate tokens
			pattern = "(?u)\\b[\\w-]+\\b"
			model = CountVectorizer(lowercase=False, token_pattern=pattern)
			model.fit(neighbors)

			# Collect all the candidates per sample that were chosen by the algorithm as nearest neighbor at least once
			candidate_dict = {neighbor: [] for neighbor in model.get_feature_names()}
			for nbr, dst in zip(neighbors, dsts):
				candidate_dict[nbr].append(dst)
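			# Weight per candidate = mean normalized similarity * number of times it was picked,
			# i.e. the summed similarity across all feature ranges and metrics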
			candidate_dict = {nbr: np.mean(candidate_dict[nbr])*len(candidate_dict[nbr]) for nbr in candidate_dict}
			candidate_dict = sorted(candidate_dict.items(), key=lambda x: x[1], reverse=True)

			fob_nodes.write(str(idx + 1) + "\t" + str(title.split('_')[-1]) + "\t" + str(author) + "\n")
			data_tup = (title,)
			for candtitle, weight in candidate_dict[:8]:
				data_tup = data_tup + (candtitle, weight,)
				weights.append(weight)
				fob_edges.write(str(idx+1) + "\t" + str(node_orientation[candtitle]) + "\t" + "Undirected" + "\t" + str(weight) + "\n")
			final_data.append(data_tup)

		# Prepare column names for dataframe
		longest = int((max(len(row) for row in final_data) - 1) / 2)
		columns = sum([['neighbor {}'.format(str(i)), 'dst {}'.format(str(i))] for i in range(1, longest+1)], [])
		columns.insert(0, 'node')
		final_df = pd.DataFrame(final_data, columns=columns).sort_values(by='node', ascending=False)
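A minimal usage sketch, assuming the method belongs to a plotting class; the class name, constructor arguments and parameter values below are hypothetical and not taken from the original code.

# Hypothetical driver code; NetworkPlotter and its constructor signature are assumptions
plotter = NetworkPlotter(folder_location='data/corpus', sample_size=2000, invalid_words=[])

# Search feature counts from 100 to 1000 and stratify the corpus to ~300,000 words
# (corpus_size is interpreted in thousands of words, see the conversion in plot())
plotter.plot(feat_range=list(range(100, 1100, 100)),
			 random_sampling='stratified',
			 corpus_size=300)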