import os
import random

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# DataReader and Vectorizer are project-local helpers (corpus loading and
# tf-idf vectorization) and are assumed to be importable from this package.


def plot(self, feat_range, random_sampling, corpus_size):
    # Standard number of neighbors: the sample itself plus its 3 nearest
    # neighbors. This cannot change unless the code changes; 3 neighbors per
    # sample are argued to make up enough consensus.
    n_nbrs = 4
    # Build a consensus of distance measures: cosine, euclidean and manhattan
    # distance are combined into a consensus tree (inspired by Eder). The
    # search also runs over a range of feature counts to make the
    # visualization less biased towards any single parameter setting.
    metric_dictionary = {'manhattan': 'manhattan',
                         'cosine': 'cosine',
                         'euclidean': 'euclidean'}

    authors, titles, texts = DataReader(self.folder_location, self.sample_size,
                                        {}, {}
                                        ).metadata(sampling=True,
                                                   type='folder',
                                                   randomization=False)

    # Random stratified sampling: each stratum receives a sampling fraction
    # proportionate to its share of the corpus.
    corpus_size = corpus_size * 1000
    if random_sampling == 'stratified':
        strata_proportions = {title.split('_')[0]:
                              int(np.round(int(title.split('_')[-1])
                                           / len(titles) * corpus_size
                                           / self.sample_size))
                              for title in titles}
        # print('::: corpus is being stratified to {} words in following proportions : '.format(str(corpus_size)))
        # print(strata_proportions, ' :::')
        strat_titles = []
        for stratum in strata_proportions:
            strata = [title for title in titles
                      if stratum == title.split('_')[0]]
            sampling_fraction = strata_proportions[stratum]
            local_rand_strat_titles = random.sample(strata, sampling_fraction)
            strat_titles.append(local_rand_strat_titles)
        strat_titles = sum(strat_titles, [])
        strat_authors = [author for author, title in zip(authors, titles)
                         if title in strat_titles]
        strat_texts = [text for title, text in zip(titles, texts)
                       if title in strat_titles]
        titles = strat_titles
        authors = strat_authors
        texts = strat_texts

    fob_nodes = open(os.path.dirname(os.getcwd()) + "/gephi_nodes.txt", "w")
    fob_edges = open(os.path.dirname(os.getcwd()) + "/gephi_edges.txt", "w")
    fob_nodes.write("Id" + "\t" + "Work" + "\t" + "Author" + "\n")
    fob_edges.write("Source" + "\t" + "Target" + "\t" + "Type" + "\t" + "Weight" + "\n")

    # Build up consensus distances over the different feature ranges and
    # distance metrics.
    exhsearch_data = []
    for n_feats in feat_range:
        # print("::: running through feature range {} ::: ".format(str(n_feats)))
        tfidf_vectors, tfidf_features = Vectorizer(texts, self.invalid_words,
                                                   n_feats=n_feats,
                                                   feat_scaling='standard_scaler',
                                                   analyzer='word',
                                                   vocab=None
                                                   ).tfidf(smoothing=True)
        if n_feats == feat_range[-1]:
            pass
            # print("FEATURES: ", ", ".join(tfidf_features))
        for metric in metric_dictionary:
            model = NearestNeighbors(n_neighbors=n_nbrs,
                                     algorithm='brute',
                                     metric=metric_dictionary[metric],
                                     ).fit(tfidf_vectors)
            distances, indices = model.kneighbors(tfidf_vectors)

            # Distances are min-max normalized and inverted so that results
            # from different metrics become comparable: after this step a
            # value close to 1 means a very near neighbor, which later doubles
            # as an edge weight. Self-distances of 0.0 are excluded from the
            # scaling. (The original code reached the same inverted scaling
            # through swapped min/max variable names.)
            all_distances = []
            for distance_vector in distances:
                for value in distance_vector:
                    if value != 0.0:
                        all_distances.append(value)
            all_distances = np.array(all_distances)
            lowest_value = all_distances.min()
            highest_value = all_distances.max()
            normalized_distances = 1 - ((distances - lowest_value)
                                        / (highest_value - lowest_value))

            # Append one row per sample: its 3 nearest neighbors with their
            # normalized distances, tagged with the current experiment.
            for distance_vec, index_vec in zip(normalized_distances, indices):
                data_tup = ('{} feats, {}'.format(str(n_feats),
                                                  metric_dictionary[metric]),
                            titles[index_vec[0]],
                            titles[index_vec[1]], distance_vec[1],
                            titles[index_vec[2]], distance_vec[2],
                            titles[index_vec[3]], distance_vec[3])
                exhsearch_data.append(data_tup)
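    # At this point exhsearch_data holds one tuple per (feature range, metric,
    # sample) combination; e.g. with feat_range=[100, 200, 300] each sample
    # contributes 3 ranges x 3 metrics = 9 rows, each listing its 3 nearest
    # neighbors with their inverted normalized distances. The consensus step
    # below pools those rows per sample, so a neighbor that recurs across many
    # parameter settings accumulates a higher total weight.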
    # Collect everything into one dataframe: one row per (experiment, sample),
    # listing that sample's three nearest neighbors and their distances.
    df = pd.DataFrame(exhsearch_data,
                      columns=['exp', 'node',
                               'neighbor 1', 'dst 1',
                               'neighbor 2', 'dst 2',
                               'neighbor 3', 'dst 3']
                      ).sort_values(by='node', ascending=False)

    final_data = []
    weights = []
    node_orientation = {title: idx + 1 for idx, title in enumerate(titles)}
    for idx, (author, title) in enumerate(zip(authors, titles)):
        neighbors = []
        dsts = []
        # Pool all neighbors and distances together (ignore the ranking of
        # nb1, nb2, etc.)
        for num in range(1, n_nbrs):
            neighbors.append([neighb for neighb in
                              df[df['node'] == title]['neighbor {}'.format(str(num))]])
            dsts.append([neighb for neighb in
                         df[df['node'] == title]['dst {}'.format(str(num))]])
        neighbors = sum(neighbors, [])
        dsts = sum(dsts, [])

        # Token pattern that keeps hyphenated title names from being split up.
        pattern = "(?u)\\b[\\w-]+\\b"
        model = CountVectorizer(lowercase=False, token_pattern=pattern)
        # Only the fitted vocabulary is needed, not the count matrix itself.
        model.fit(neighbors)

        # Collect, per sample, every candidate that was chosen as a nearest
        # neighbor at least once; its weight is the mean (inverted) distance
        # times the number of times it was chosen, so frequent close
        # neighbors rank highest.
        candidate_dict = {neighbor: []
                          for neighbor in model.get_feature_names_out()}
        for nbr, dst in zip(neighbors, dsts):
            candidate_dict[nbr].append(dst)
        candidate_dict = {nbr: np.mean(candidate_dict[nbr]) * len(candidate_dict[nbr])
                          for nbr in candidate_dict}
        candidate_dict = sorted(candidate_dict.items(),
                                key=lambda x: x[1], reverse=True)

        fob_nodes.write(str(idx + 1) + "\t" + str(title.split('_')[-1])
                        + "\t" + str(author) + "\n")
        data_tup = (title,)
        # Keep at most the 8 strongest consensus neighbors per node.
        for candtitle, weight in candidate_dict[:8]:
            data_tup = data_tup + (candtitle, weight,)
            weights.append(weight)
            fob_edges.write(str(idx + 1) + "\t" + str(node_orientation[candtitle])
                            + "\t" + "Undirected" + "\t" + str(weight) + "\n")
        final_data.append(data_tup)

    fob_nodes.close()
    fob_edges.close()

    # Prepare column names for the final dataframe: nodes can have a varying
    # number of consensus neighbors, so pad the columns to the longest row.
    longest = int((len(final_data[np.argmax([len(i) for i in final_data])]) - 1) / 2)
    columns = sum([['neighbor {}'.format(str(i)), 'dst {}'.format(str(i))]
                   for i in range(1, longest + 1)], [])
    columns.insert(0, 'node')
    final_df = pd.DataFrame(final_data, columns=columns
                            ).sort_values(by='node', ascending=False)
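# A minimal usage sketch (illustrative only). The enclosing class is not shown
# in this excerpt; `ConsensusPlotter` is a placeholder name, and the
# constructor arguments mirror the attributes the method reads
# (self.folder_location, self.sample_size, self.invalid_words):
#
#     plotter = ConsensusPlotter(folder_location='data/corpus',
#                                sample_size=2000,
#                                invalid_words=[])
#     plotter.plot(feat_range=[100, 200, 300, 400, 500],
#                  random_sampling='stratified',
#                  corpus_size=50)  # interpreted as 50,000 words
#
# The call writes gephi_nodes.txt and gephi_edges.txt to the parent directory
# of the current working directory, ready for import into Gephi as an
# undirected weighted network.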