def retrieve_graph_lda_data():
  graph_file = 'cache/%s/graph.pkl' % THE.permitted
  vectorizer_file = 'cache/%s/vectorizer.pkl' % THE.permitted
  doc_2_vec_file = 'cache/%s/doc_2_vec.pkl' % THE.permitted
  documents_file = 'cache/%s/documents.pkl' % THE.permitted
  lda_model_file = 'cache/%s/lda_model.pkl' % THE.permitted
  vocabulary_file = 'cache/%s/vocabulary.pkl' % THE.permitted
  if os.path.isfile(graph_file) and os.path.isfile(vectorizer_file) \
      and os.path.isfile(doc_2_vec_file) and os.path.isfile(documents_file) \
      and os.path.isfile(lda_model_file) and os.path.isfile(vocabulary_file):
    # Pickled artefacts are opened in binary mode so loading also works
    # on Windows and Python 3.
    with open(graph_file, 'rb') as f:
      graph = pkl.load(f)
    miner = Miner(graph)
    with open(vectorizer_file, 'rb') as f:
      miner.vectorizer = pkl.load(f)
    with open(doc_2_vec_file, 'rb') as f:
      miner.doc_2_vec = joblib.load(f)
    with open(documents_file, 'rb') as f:
      miner.documents = pkl.load(f)
    with open(lda_model_file, 'rb') as f:
      lda_model = pkl.load(f)
    with open(vocabulary_file, 'rb') as f:
      vocab = pkl.load(f)
  else:
    miner, graph, lda_model, vocab = store_graph_lda_data()
  return miner, graph, lda_model, vocab

def retrieve_graph_lda_data():
  """
  Fetch stored metadata from the versioned cache, rebuilding it if any
  cache file is missing.
  :return: miner, graph, lda_model, vocab
  """
  graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
  vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version, THE.permitted)
  doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
  documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
  lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
  vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version, THE.permitted)
  if os.path.isfile(graph_file) and os.path.isfile(vectorizer_file) \
      and os.path.isfile(doc_2_vec_file) and os.path.isfile(documents_file) \
      and os.path.isfile(lda_model_file) and os.path.isfile(vocabulary_file):
    with open(graph_file, 'rb') as f:
      graph = cPkl.load(f)
    miner = Miner(graph)
    with open(vectorizer_file, 'rb') as f:
      miner.vectorizer = cPkl.load(f)
    with open(doc_2_vec_file, 'rb') as f:
      miner.doc_2_vec = joblib.load(f)
    with open(documents_file, 'rb') as f:
      miner.documents = cPkl.load(f)
    with open(lda_model_file, 'rb') as f:
      lda_model = cPkl.load(f)
    with open(vocabulary_file, 'rb') as f:
      vocab = cPkl.load(f)
  else:
    miner, graph, lda_model, vocab = store_graph_lda_data()
  return miner, graph, lda_model, vocab

def get_top_papers():
  n_topics = 7
  top_papers = {}
  for index in range(n_topics):
    top_papers[index] = []
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(n_topics, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  for paper_id, paper in graph.paper_nodes.items():
    topics = miner.documents[paper_id].topics_count
    # if int(paper.year) < 2009: continue
    if max(topics) == 0:
      continue
    topic = topics.argmax()
    # cites = len(paper.cites.split(",")) if paper.cites else 0
    cites = paper.local_cites
    top_papers[topic].append([(cites, paper.title, paper.authors, paper.year)])
  for index in range(n_topics):
    # Tuples sort lexicographically, so reverse-sorting ranks by citation
    # count first; keep the four most-cited papers per topic.
    top_papers[index] = sorted(top_papers[index], reverse=True)[:4]
    print("***", index, "***")
    for paper in top_papers[index]:
      paper = paper[0]
      print(paper[0], paper[-1] + " - " + paper[1] + ", " + paper[2])

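# Aside: a self-contained sketch (made-up values, not project data) of the
# tuple-sorting idiom that get_top_papers relies on. reverse=True ranks by
# the first tuple element, i.e. the citation count.
def _demo_top_paper_sort():
  papers = [(3, "A"), (10, "B"), (7, "C")]
  assert sorted(papers, reverse=True)[:2] == [(10, "B"), (7, "C")]
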
def retrieve_graph_lda_data():
  """
  Fetch stored metadata from the versioned cache, rebuilding it if any
  cache file is missing.
  :return: miner, graph, lda_model, vocab
  """
  graph_file = 'cache/%s/%s/graph.pkl' % (THE.version, THE.permitted)
  vectorizer_file = 'cache/%s/%s/vectorizer.pkl' % (THE.version, THE.permitted)
  doc_2_vec_file = 'cache/%s/%s/doc_2_vec.pkl' % (THE.version, THE.permitted)
  documents_file = 'cache/%s/%s/documents.pkl' % (THE.version, THE.permitted)
  lda_model_file = 'cache/%s/%s/lda_model.pkl' % (THE.version, THE.permitted)
  vocabulary_file = 'cache/%s/%s/vocabulary.pkl' % (THE.version, THE.permitted)
  if os.path.isfile(graph_file) and os.path.isfile(vectorizer_file) \
      and os.path.isfile(doc_2_vec_file) and os.path.isfile(documents_file) \
      and os.path.isfile(lda_model_file) and os.path.isfile(vocabulary_file):
    with open(graph_file, 'rb') as f:
      graph = cPkl.load(f)
    miner = Miner(graph, permitted=THE.permitted,
                  ignores=THE.IGNORE_VENUES[THE.version])
    with open(vectorizer_file, 'rb') as f:
      miner.vectorizer = cPkl.load(f)
    with open(doc_2_vec_file, 'rb') as f:
      miner.doc_2_vec = joblib.load(f)
    with open(documents_file, 'rb') as f:
      miner.documents = cPkl.load(f)
    with open(lda_model_file, 'rb') as f:
      lda_model = cPkl.load(f)
    with open(vocabulary_file, 'rb') as f:
      vocab = cPkl.load(f)
  else:
    miner, graph, lda_model, vocab = store_graph_lda_data()
  return miner, graph, lda_model, vocab

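# Minimal usage sketch: the loader is idempotent, so callers just ask for
# the data and let the cache decide whether to deserialize or rebuild via
# store_graph_lda_data. Assumes cache/<THE.version>/<THE.permitted>/ from
# a previous run; the print is illustrative only.
def _demo_retrieve():
  miner, graph, lda_model, vocab = retrieve_graph_lda_data()
  print(len(miner.documents), lda_model.n_topics, len(vocab))
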
def get_graph_lda_data(iterations=ITERATIONS):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(N_TOPICS, n_iter=iterations, alpha=ALPHA, beta=BETA)
  return miner, graph, lda_model, vocab

def lda_topics():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.22359, beta=0.53915)
  # lda_model, vocab = miner.lda(11, n_iter=100, alpha=0.847433736937, beta=0.763774618977)
  n_top_words = 15
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))

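# The top-words slice above is dense, so here is a self-contained sketch of
# the same idiom (vocab and probabilities are illustrative only). argsort
# sorts ascending, and [:-(n + 1):-1] walks backwards over the last n
# entries, i.e. the n highest-probability words, highest first.
def _demo_top_words_slice():
  import numpy as np
  vocab = np.array(["alpha", "beta", "gamma", "delta"])
  topic_dist = np.array([0.1, 0.4, 0.2, 0.3])
  top_2 = vocab[np.argsort(topic_dist)][:-3:-1]
  assert list(top_2) == ["beta", "delta"]
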
def get_graph_lda_data():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph, THE.permitted, THE.IGNORE_VENUES[THE.version])
  lda_model, vocab = miner.lda(get_n_topics(), n_iter=ITERATIONS, alpha=ALPHA,
                               beta=BETA, stop_words=STOP_WORDS)
  return miner, graph, lda_model, vocab

def super_author(fig_prefix="super_author", top_percent=1.00):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  authors = graph.get_papers_by_authors()
  author_topics = {}
  tops = top_authors(graph, top_percent)
  for author_id, papers in authors.items():
    if author_id not in tops:
      continue
    # Mark a topic as "covered" by the author if any of their papers
    # crosses the topic-count threshold.
    topics = [0] * lda_model.n_topics
    for paper_id, _, __ in papers:
      document = miner.documents[paper_id]
      for index, topic_count in enumerate(document.topics_count):
        if topic_count >= TOPIC_THRESHOLD:
          topics[index] = 1
    author_topics[author_id] = sum(topics)
  vals = sorted(author_topics.values(), reverse=True)
  # x_axis = range(1, len(vals) + 1)
  # plt.ylabel("Topic Count")
  # plt.xlabel("Author ID")
  # plt.title("Super Author")
  # plt.ylim(min(vals) - 1, max(vals) + 1)
  # plt.plot(x_axis, vals)
  # plt.savefig("figs/super_author/%s.png" % fig_prefix)
  # plt.clf()
  fig = plt.figure(figsize=(8, 2), dpi=100)
  counter = Counter()
  for val in vals:
    counter[val] += 1
  bar_x = []
  bar_y = []
  for key in sorted(counter.keys()):
    bar_x.append(key)
    bar_y.append(counter[key])
  print(bar_x, bar_y)
  return  # NOTE: the plotting code below is unreachable as written.
  fig, ax = plt.subplots()
  width = 2 / 3
  ax.bar(bar_x, bar_y, width, color='blue', align='center')
  ax.set_xticks(np.arange(1, lda_model.n_topics + 1))
  ax.set_xticklabels(np.arange(1, lda_model.n_topics + 1))
  # for i, v in zip(bar_x, bar_y):
  #   ax.text(i, v + 0.25, str(v), color='red', fontweight='bold', fontsize=11,
  #           horizontalalignment='center')
  plt.xlabel("Topics")
  plt.ylabel("Authors Count")
  # plt.ylim(min(bar_y) - 1, max(bar_y) + 1)
  plt.savefig("figs/super_author/%s_bar.png" % fig_prefix)
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))

def topic_evolution():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  paper_nodes = graph.paper_nodes
  topics_map = {}
  n_topics = lda_model.n_topics
  for paper_id, paper in paper_nodes.items():
    document = miner.documents[paper_id]
    year_topics = topics_map.get(paper.year, np.array([0] * n_topics))
    topics_map[paper.year] = np.add(year_topics, document.topics_count)
  yt_map = {}
  for year, t_count in topics_map.items():
    yt_map[year] = percent_sort(t_count)
  width = 0.8
  plts = []
  x_axis = np.arange(1, len(yt_map.keys()) + 1)
  # x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0] * len(yt_map.keys()))
  colors_dict = {}
  TOP_TOPIC_COUNT = 7
  for index in range(TOP_TOPIC_COUNT):
    bar_val, color = [], []
    for year in sorted(yt_map.keys(), key=lambda x: int(x)):
      topic = yt_map[year][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic %")
  plt.xlabel("Year")
  plt.xticks(x_axis + width / 2,
             [str(y)[2:] for y in sorted(yt_map.keys(), key=lambda x: int(x))])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = []
  squares = []
  names = []
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  for index, (topic, color) in enumerate(colors_dict.items()):
    print(topic)
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
    squares.append(plts[index][0])
    # names.append('Topic %s' % str(topic))
    # names.append(t_names[index])
  # plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05),
  #            ncol=7, fontsize=9)
  plt.legend(tuple(patches), tuple(t_names), loc='upper center',
             bbox_to_anchor=(0.5, 1.14), ncol=4, fontsize=11, handlelength=0.7)
  plt.savefig("figs/topic_evolution/topic_evolution_7_gib.png")
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))

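# The stacking in topic_evolution comes from the bottom= keyword: each topic
# layer is drawn on top of the running y_offset. A minimal sketch with
# fabricated percentages (not project data); the output path is arbitrary.
def _demo_stacked_bars():
  import numpy as np
  import matplotlib.pyplot as plt
  x = np.arange(1, 4)
  y_offset = np.zeros(3)
  for layer in ([30, 20, 10], [70, 80, 90]):  # two layers summing to 100%
    plt.bar(x, layer, 0.8, bottom=y_offset)
    y_offset = np.add(y_offset, layer)
  plt.savefig("demo_stacked.png")
  plt.clf()
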
def conference_evolution_2(paper_range, figname):
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      if tup[0] not in paper_range:
        continue
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  n_top_words = 10
  # Heatmap
  heatmap_arr = []
  column_labels = []
  for conference_id, conf in zip(sorted(conference_heatmaps.keys(), key=lambda x: int(x)),
                                 mysql.get_conferences()):
    tot = sum(conference_heatmaps[conference_id])
    if tot == 0:
      continue
    column_labels.append(conf.acronym)
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  row_labels = range(lda_model.n_topics)
  ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/diversity/dend_heatmap_7topics.png")
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                     "figs/evolution/%s.png" % figname)

def pc_topics_heatmap(year_range=None):
  def index_by_year(tups):
    y_comm = {}
    for tup in tups:
      comm = y_comm.get(tup[1], set())
      comm.add(tup[0])
      y_comm[tup[1]] = comm
    return y_comm

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  max_len = 21
  start = 1993
  p_conferences = graph.get_papers_by_venue()
  p_committees = graph.get_committee_by_conference()
  conference_topics = {}
  for conference in mysql.get_conferences():
    year_committees = index_by_year(p_committees[conference.id])
    year_papers = index_by_year(p_conferences[conference.id])
    year_scores = {}
    topics = np.array([0] * lda_model.n_topics)
    for year in sorted(year_committees.keys(), key=lambda y: int(y)):
      if (year_range is not None) and (int(year) not in year_range):
        continue
      papers = year_papers.get(year, None)
      if papers is None:
        year_scores[int(year)] = None
        continue
      committee = year_committees[year]
      for paper_id in papers:
        paper = graph.paper_nodes[paper_id]
        author_ids = set(paper.author_ids.strip().split(","))
        # Skip papers with any author on that year's programme committee.
        if author_ids.intersection(committee):
          continue
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference.id] = topics
  heatmap_arr = []
  for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
    tot = sum(conference_topics[conference_id])
    dist = [top / tot for top in conference_topics[conference_id]]
    heatmap_arr.append(dist)
  row_labels = range(lda_model.n_topics)
  column_labels = [c.acronym for c in mysql.get_conferences()]
  make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                     "figs/pc/pc_heatmap_7topics.png")

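# Toy sketch of the committee-overlap filter above, with hypothetical ids:
# any non-empty intersection between a paper's authors and that year's
# programme committee excludes the paper from the topic totals.
def _demo_pc_filter():
  author_ids = {"12", "45"}
  committee = {"45", "99"}
  assert author_ids.intersection(committee)  # truthy -> paper is skipped
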
def predict_venues(estimators, is_independent=IS_INDEPENDENT_VENUE, n_folds=5,
                   n_topics=N_TOPICS, alpha=ALPHA, beta=BETA, n_iter=100,
                   min_tfidf_score=0.1, tfidf_top=100, random_state=RANDOM_STATE):
  def make_key(pred, pre_proc):
    return "%s - %s" % (pred.__name__, pre_proc.__name__)

  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  papers, groups = get_papers_and_groups(graph, is_independent=is_independent)
  metrics_map = {make_key(predictor, preprocessor): []
                 for predictor, preprocessor in estimators}
  for index, (train_x, train_y, test_x, test_y) in enumerate(
      split(papers, groups, n_folds=n_folds)):
    print("#### Iteration %d" % (index + 1))
    # TSNE
    process_embeddings(index, train_x, test_x)
    # Count Vectorizer
    vectorizer = CountVectorizer(stop_words=STOP_WORDS, token_pattern=TOKEN_PATTERN)
    train_docs = [paper.raw for paper in train_x]
    test_docs = [paper.raw for paper in test_x]
    train_vectorized = vectorizer.fit_transform(train_docs).toarray()
    test_vectorized = vectorizer.transform(test_docs).toarray()
    # TFIDF: zero out weak scores, then keep the columns with the highest
    # mean score across the training documents.
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(train_vectorized).toarray()
    tfidf_matrix[tfidf_matrix < min_tfidf_score] = 0
    tfidf_means = np.mean(tfidf_matrix, axis=0)
    tfidf_top_indices = np.argsort(tfidf_means)[::-1][:tfidf_top]
    # LDA-DE
    alpha = alpha if alpha else 50.0 / N_TOPICS
    beta = beta if beta else 0.01
    lda_model = lda.LDA(n_topics=n_topics, alpha=alpha, eta=beta, n_iter=n_iter,
                        random_state=random_state)
    train_transformed = lda_model.fit_transform(train_vectorized)
    # Putting it together
    for i, (vectorized, topics) in enumerate(zip(train_vectorized, train_transformed)):
      train_x[i].vectorized = vectorized[tfidf_top_indices]
      train_x[i].topics_count = topics
      sum_t = sum(topics)
      sum_t = sum_t if sum_t else 0.00001
      train_x[i].topics_score = [float(t / sum_t) for t in topics]
    test_transformed = lda_model.transform(test_vectorized)
    for i, (vectorized, topics) in enumerate(zip(test_vectorized, test_transformed)):
      test_x[i].vectorized = vectorized[tfidf_top_indices]
      test_x[i].topics_count = topics
      sum_t = sum(topics)
      sum_t = sum_t if sum_t else 0.00001
      test_x[i].topics_score = [t / sum_t for t in topics]
    for predictor, preprocessor in estimators:
      key = make_key(predictor, preprocessor)
      print(key)
      predicted, metrics = predictor(preprocessor, train_x, train_y, test_x, test_y)
      metrics_map[key].append(metrics)
  for predictor, preprocessor in estimators:
    key = make_key(predictor, preprocessor)
    print("### " + key)
    measures = Metrics.avg_score(metrics_map[key])
    print(measures)

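# Self-contained sketch of the TF-IDF pruning step in predict_venues, with
# made-up numbers: entries below the threshold are zeroed, then the columns
# with the highest mean score are kept (analogues of min_tfidf_score and
# tfidf_top).
def _demo_tfidf_pruning():
  import numpy as np
  m = np.array([[0.05, 0.9, 0.3],
                [0.02, 0.8, 0.4]])
  m[m < 0.1] = 0
  top = np.argsort(np.mean(m, axis=0))[::-1][:2]
  assert list(top) == [1, 2]
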
def get_graph_lda_data():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph, THE.permitted)
  lda_model, vocab = miner.lda(get_n_topics(), n_iter=ITERATIONS, alpha=ALPHA,
                               beta=BETA, stop_words=STOP_WORDS)
  return miner, graph, lda_model, vocab

def conference_evolution():
  legit_conferences = ["ICSE", "MSR", "FSE", "ASE"]
  non_legit_conferences = ["GPCE", "FASE"]
  TOP_TOPIC_COUNT = 7
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  f, subplts = plt.subplots(3, 3)
  f.tight_layout()
  y_counter = -1
  x_counter = 0
  for conf_index, conference in enumerate(mysql.get_conferences()):
    # if conference.acronym not in legit_conferences: continue
    if conference.acronym in non_legit_conferences:
      continue
    y_counter += 1
    if y_counter > 2:
      x_counter += 1
      y_counter = 0
    year_topics = {}
    year_heatmaps = {}
    for year, papers in yearize(conferences[conference.id]).items():
      topics = np.array([0] * lda_model.n_topics)
      for paper_id in papers:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
      year_heatmaps[year] = topics
      year_topics[year] = percent_sort(topics)
    width = 0.8
    x_axis = np.arange(1, len(year_topics.keys()) + 1)
    # x_axis = [c.acronym for c in mysql.get_conferences()]
    y_offset = np.array([0] * len(year_topics.keys()))
    colors_dict = {}
    for index in range(TOP_TOPIC_COUNT):
      bar_val, color = [], []
      for year in sorted(year_topics.keys(), key=lambda x: int(x)):
        topic = year_topics[year][index]
        colors_dict[topic[0]] = get_color(topic[0])
        color.append(colors_dict[topic[0]])
        bar_val.append(topic[1])
      subplts[x_counter, y_counter].bar(x_axis, bar_val, width, color=color,
                                        bottom=y_offset)
      y_offset = np.add(y_offset, bar_val)
    # subplts[x_counter, y_counter].set_ylabel("Topic Coverage %")
    # subplts[x_counter, y_counter].set_xlabel("Conferences")
    if len(year_topics.keys()) <= 14:
      subplts[x_counter, y_counter].set_xticks(x_axis + width / 2)
      subplts[x_counter, y_counter].set_xticklabels(
          [str(y)[2:] for y in sorted(year_topics.keys(), key=lambda x: int(x))],
          fontsize=7)
    else:
      subplts[x_counter, y_counter].set_xticks(
          np.arange(1, len(year_topics.keys()) + 1, 2) + width / 2)
      subplts[x_counter, y_counter].set_xticklabels(
          [str(y)[2:] for index, y in
           enumerate(sorted(year_topics.keys(), key=lambda x: int(x)))
           if index % 2 == 0], fontsize=7)
    subplts[x_counter, y_counter].set_yticks(np.arange(0, 101, 20))
    subplts[x_counter, y_counter].set_ylim([0, 101])
    subplts[x_counter, y_counter].set_title(conference.acronym)
  # Legends
  patches = []
  labels = []
  for topic in range(lda_model.n_topics):
    patches.append(mpatches.Patch(color=get_color(topic)))
    labels.append('Topic %s' % str(topic))
  f.legend(handles=patches, labels=labels, loc='upper center',
           bbox_to_anchor=(0.5, 0.04), ncol=12, fontsize=7)
  plt.savefig("figs/evolution/evolution_7topics.png")
  plt.clf()
  n_top_words = 10
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))

def conference_diversity():
  graph = cite_graph(GRAPH_CSV)
  miner = Miner(graph)
  lda_model, vocab = miner.lda(7, n_iter=100, alpha=0.847433736937,
                               beta=0.763774618977)
  conferences = graph.get_papers_by_venue()
  conference_topics = {}
  conference_heatmaps = {}
  for conference_id, papers in conferences.items():
    topics = np.array([0] * lda_model.n_topics)
    for tup in yearize(papers).items():
      for paper_id in tup[1]:
        topics = np.add(topics, miner.documents[paper_id].topics_count)
    conference_topics[conference_id] = percent_sort(topics)
    conference_heatmaps[conference_id] = topics
  # fig, ax = plt.subplots()
  bar_vals = []
  colors = []
  width = 0.75
  plts = []
  x_axis = np.arange(1, len(conference_topics.keys()) + 1)
  # x_axis = [c.acronym for c in mysql.get_conferences()]
  y_offset = np.array([0] * len(conference_topics.keys()))
  colors_dict = {}
  for index in range(7):
    bar_val = []
    color = []
    for conference_id in sorted(conference_topics.keys(), key=lambda x: int(x)):
      topic = conference_topics[conference_id][index]
      colors_dict[topic[0]] = get_color(topic[0])
      color.append(colors_dict[topic[0]])
      bar_val.append(topic[1])
    plts.append(plt.bar(x_axis, bar_val, width, color=color, bottom=y_offset))
    y_offset = np.add(y_offset, bar_val)
  plt.ylabel("Topic Coverage %")
  plt.xlabel("Conferences")
  plt.xticks(x_axis + width / 2, [c.acronym for c in mysql.get_conferences()])
  plt.yticks(np.arange(0, 101, 10))
  plt.ylim([0, 101])
  # Legends
  patches = []
  for topic, color in colors_dict.items():
    patches.append(mpatches.Patch(color=color, label='Topic %s' % str(topic)))
  plt.legend(handles=patches, loc='upper center', bbox_to_anchor=(0.5, -0.05),
             ncol=6, fontsize=7)
  plt.savefig("figs/diversity/conference_diversity_7topics.png")
  plt.clf()
  n_top_words = 10
  # Heatmap
  heatmap_arr = []
  for conference_id in sorted(conference_heatmaps.keys(), key=lambda x: int(x)):
    tot = sum(conference_heatmaps[conference_id])
    dist = [top / tot for top in conference_heatmaps[conference_id]]
    heatmap_arr.append(dist)
  fig, ax = plt.subplots()
  heatmap_arr = np.array(heatmap_arr)
  heatmap = ax.pcolor(heatmap_arr, cmap=plt.cm.Reds)
  plt.ylabel("Conferences")
  plt.xlabel("Topics")
  # row_labels = range(lda_model.n_topics)
  t_names = ["Testing", "Applications", "Program Analysis", "Tools and Projects",
             "Defect Analysis", "Modeling", "Maintenance"]
  row_labels = [str(ind) + "-" + name
                for ind, name in zip(range(lda_model.n_topics), t_names)]
  column_labels = [c.acronym for c in mysql.get_conferences()]
  ax.set_xticks(np.arange(heatmap_arr.shape[1]) + 0.5, minor=False)
  ax.set_yticks(np.arange(heatmap_arr.shape[0]) + 0.5, minor=False)
  ax.set_xticklabels(row_labels, minor=False)
  ax.set_yticklabels(column_labels, minor=False)
  plt.savefig("figs/diversity/heatmap_7topics.png")
  plt.clf()
  for index, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(index, ', '.join(topic_words)))
  # make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
  #                    "figs/diversity/dend_heatmap_7topics.png")
  make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
               "figs/diversity/heatmap2.png")
