def print_gender_topics(file_name):
    """Print a CSV matrix (topics as rows, venues as columns) of gender-stat flags.

    Loads the pickled per-venue gender statistics plus the heat-map axis data,
    then prints one CSV row per permitted topic. Each cell is 'T'/'F' taken
    from the boolean at stat_map[venue_id][topic_id][2], or empty when the
    venue is missing from the stats or the heat map has no data for the
    venue/topic pair.

    :param file_name: base name (without extension) of the stats pickle under
        figs/<version>/<permitted>/gender/.
    """
    stat_path = "figs/%s/%s/gender/%s.pkl" % (THE.version, THE.permitted, file_name)
    with open(stat_path) as handle:
        stat_map = cPkl.load(handle)
    venues = mysqldb.get_venues()
    # Short acronym -> venue id, iterating venue ids in numeric order.
    venue_names = {}
    for v_id in sorted(venues.keys(), key=lambda k: int(k)):
        venue_names[shorter_names(venues[v_id].acronym)] = v_id
    with open("figs/%s/%s/stats/heatmap_data.pkl" % (THE.version, THE.permitted)) as handle:
        axis_data = cPkl.load(handle)
    permitted_topics = axis_data['rows']
    permitted_venues = axis_data['columns']
    valid_data = axis_data['data']
    # Header row: leading empty cell for the topic-name column.
    print("," + ",".join(permitted_venues))
    topics = get_topics()
    for topic_name in permitted_topics:
        topic_id = topics.index(topic_name)
        row = [topic_name]
        for venue in permitted_venues:
            venue_id = venue_names[venue]
            if venue_id in stat_map and valid_data[venue][topic_name] > 0:
                cell = 'T' if stat_map[venue_id][topic_id][2] else 'F'
            else:
                cell = ''
            row.append(cell)
        print(",".join(row))
def print_gender_topics(file_name):
    """Print a CSV (topics x venues) of gender-stat booleans.

    Each cell is 'T' or 'F' from stat_map[venue_id][topic_id][2], or empty
    when the venue is absent from the stats or the heat-map axis data marks
    the venue/topic pair as having no observations.

    NOTE(review): this module appears to define print_gender_topics twice;
    at import time the later definition wins.

    :param file_name: base name of the pickle under
        figs/<version>/<permitted>/gender/.
    """
    with open("figs/%s/%s/gender/%s.pkl" % (THE.version, THE.permitted, file_name)) as pkl:
        stat_map = cPkl.load(pkl)
    venues = mysqldb.get_venues()
    acronym_to_id = {
        shorter_names(venues[v_id].acronym): v_id
        for v_id in sorted(venues.keys(), key=lambda k: int(k))
    }
    with open("figs/%s/%s/stats/heatmap_data.pkl" % (THE.version, THE.permitted)) as pkl:
        axis_data = cPkl.load(pkl)
    permitted_topics = axis_data['rows']
    permitted_venues = axis_data['columns']
    valid_data = axis_data['data']
    print("," + ",".join(permitted_venues))
    topics = get_topics()
    for topic_name in permitted_topics:
        topic_id = topics.index(topic_name)
        cells = [topic_name]
        for venue in permitted_venues:
            venue_id = acronym_to_id[venue]
            has_stat = venue_id in stat_map and valid_data[venue][topic_name] > 0
            cells.append(('T' if stat_map[venue_id][topic_id][2] else 'F') if has_stat else '')
        print(",".join(cells))
def venue_dist_similarity(save_file):
    """Compare each venue's topic distribution against aggregate distributions.

    Reads the CSV written by venue_distributions() and prints, for every
    venue, cosine similarity (higher = more similar, hence "Cosine+") and
    euclidean distance (lower = more similar, hence "Eucledian-") against
    the whole-dataset distribution, and additionally against the venue's
    own class aggregate (conference or journal).

    :param save_file: path to the CSV produced by venue_distributions().
    """
    def cosine(a, b):
        # Cosine *similarity* = 1 - cosine distance, rounded to 3 decimals.
        return round(1 - spatial.distance.cosine(a, b), 3)

    def eucledian(a, b):
        return round(spatial.distance.euclidean(a, b), 3)

    with open(save_file, 'rb') as f:
        content = f.read()
    full = None
    conf = None
    jour = None
    venues = {}
    all_venues = mysqldb.get_venues()
    for row in content.split("\n"):
        # Skip blank lines (e.g. a trailing newline): otherwise cells[0] == ''
        # falls into the else branch and later crashes on all_venues[''].
        if not row.strip():
            continue
        cells = row.split(", ")
        if cells[0] == "Venue":  # header row
            continue
        # list(...) so each vector can be iterated more than once;
        # map() is a single-pass iterator on Python 3.
        if cells[0] == "all":
            full = list(map(float, cells[1:]))
        elif cells[0] == "conference":
            conf = list(map(float, cells[1:]))
        elif cells[0] == "journal":
            jour = list(map(float, cells[1:]))
        else:
            key = cells[0]
            venues[key] = list(map(float, cells[1:]))
    print("Venue", "Cosine+", "Eucledian-", "Loc-Cosine", "Loc-Eucledian")
    print("%s, %0.3f, %0.3f, , " % ("Conference", cosine(full, conf), eucledian(full, conf)))
    print("%s, %0.3f, %0.3f, , " % ("Journal", cosine(full, jour), eucledian(full, jour)))
    for key, val in venues.items():
        # "Local" aggregate: the venue's own class (conference vs journal).
        loc = conf if all_venues[key].is_conference else jour
        print("%s, %0.3f, %0.3f, %0.3f, %0.3f" % (
            shorter_names(all_venues[key].acronym),
            cosine(full, val), eucledian(full, val),
            cosine(loc, val), eucledian(loc, val)))
def diversity(fig_name, paper_range=None, min_diversity_score=MIN_DIVERSITY_SCORE, save_labels=False):
    """
    Heat map in paper: per-venue topic distributions, plain and dendrogram.

    NOTE(review): this module defines diversity() more than once; the
    definition appearing last in the file is the one that takes effect.

    :param fig_name: base name for PNGs under figs/<version>/<permitted>/diversity/.
    :param paper_range: optional iterable of years; papers outside it are skipped.
    :param min_diversity_score: topic shares below this are zeroed; topics that
        become all-zero across every venue are dropped from the heat map.
    :param save_labels: forwarded to make_dendo_heatmap.
    :return: None (figures are written to disk).
    """
    if paper_range:
        print("DIVERSITY for %s between %d - %d" % (THE.permitted, paper_range[0], paper_range[-1]))
    else:
        print("DIVERSITY for %s" % THE.permitted)
    miner, graph, lda_model, vocab = retrieve_graph_lda_data()
    paper_map = graph.get_papers_by_venue(THE.permitted)
    venue_heatmaps = {}
    venues = mysqldb.get_venues()
    # Accumulate one topic-count vector per venue over the selected years.
    for conference_id, papers in paper_map.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if paper_range and tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                document = miner.documents.get(paper_id, None)
                if document is not None:
                    topics = np.add(topics, miner.documents[paper_id].topics_count)
        if sum(topics) > 0:
            venue_heatmaps[conference_id] = topics
    # row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), get_topics())]
    row_labels = np.array([name for ind, name in zip(range(lda_model.n_topics), get_topics())])
    # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
    column_labels = np.array([shorter_names(v.acronym) for c, v in venues.items() if v.id in venue_heatmaps])
    # Heatmap: one normalized topic-distribution row per venue (transposed below).
    heatmap_arr = []
    for conference_id in sorted(venue_heatmaps.keys(), key=lambda x: int(x)):
        tot = sum(venue_heatmaps[conference_id])
        # NOTE(review): assumes true division (future import or float counts);
        # with Python 2 integer division these shares would all be 0 — confirm.
        dist = [top / tot for top in venue_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    heatmap_arr = np.transpose(heatmap_arr)
    heatmap_arr[heatmap_arr < min_diversity_score] = 0.0
    # Drop topic rows whose every venue share fell below the threshold.
    row_sums = np.sum(heatmap_arr, axis=1)
    heatmap_arr = heatmap_arr[np.where(row_sums > 0)]
    row_labels = row_labels[np.where(row_sums > 0)]
    mkdir("figs/%s/%s/diversity" % (THE.version, THE.permitted))
    make_heatmap(heatmap_arr, row_labels, column_labels,
                 "figs/%s/%s/diversity/%s.png" % (THE.version, THE.permitted, fig_name),
                 paper_range)
    make_dendo_heatmap(heatmap_arr, row_labels, column_labels,
                       "figs/%s/%s/diversity/%s_dend.png" % (THE.version, THE.permitted, fig_name),
                       paper_range, save_labels)
def yearly_gender_topics(paper_range=None, file_name="yearly_topic_contribution"):
    """Pickle per-venue, per-year topic distributions split by author gender.

    For each venue and publication year, collects the normalized topic
    distribution of every paper three ways: all papers with a non-zero
    distribution, those with at least one male author, and those with at
    least one female author. The resulting mapping
    {venue_id: {year: {"all"|"male"|"female": [dist, ...]}}} is pickled to
    figs/<version>/<permitted>/gender/<file_name>.pkl.

    :param paper_range: optional iterable of years (ints); other years skipped.
    :param file_name: base name of the output pickle.
    """
    def index_by_year(tups):
        # (paper_id, year) tuples -> {year: set(paper_ids)}
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    def normalize(arr):
        # Scale entries to sum to 1; an all-zero vector stays all-zero.
        arr_tot = sum(arr)
        if arr_tot == 0:
            return [0] * len(arr)
        return [arr_i / arr_tot for arr_i in arr]

    miner, graph, lda_model, vocab = get_graph_lda_data()
    p_venues = graph.get_papers_by_venue(permitted=THE.permitted)
    author_gender_map = get_author_genders()
    venue_topics = OrderedDict()
    for venue_id in sorted(mysqldb.get_venues().keys()):
        venue_id = str(venue_id)
        # NOTE(review): raises KeyError if a venue id has no permitted papers
        # in p_venues — confirm get_papers_by_venue covers every venue.
        year_papers = index_by_year(p_venues[venue_id])
        year_topics = OrderedDict()
        for year in sorted(year_papers.keys(), key=lambda y: int(y)):
            both_genders_topics = []
            male_topics = []
            female_topics = []
            if (paper_range is not None) and (int(year) not in paper_range):
                continue
            papers = year_papers.get(year, [])
            if len(papers) > 0:
                for paper_id in papers:
                    paper = graph.paper_nodes[paper_id]
                    author_ids = paper.author_ids.strip().split(",")
                    paper_topics = miner.documents[paper_id].topics_count
                    # unit_paper_topics = [t / len(author_ids) for t in paper_topics]
                    male_count, female_count = 0, 0
                    for author_id in author_ids:
                        gender = author_gender_map.get(author_id, None)
                        if gender == 'm':
                            male_count += 1
                        elif gender == 'f':
                            female_count += 1
                    normalized_topics = normalize(paper_topics)
                    # Papers with an all-zero topic vector are excluded entirely.
                    if sum(normalized_topics) > 0:
                        both_genders_topics.append(normalized_topics)
                        if male_count > 0:
                            male_topics.append(normalized_topics)
                        if female_count > 0:
                            female_topics.append(normalized_topics)
            year_topics[year] = {
                "all": both_genders_topics,
                "male": male_topics,
                "female": female_topics
            }
        venue_topics[venue_id] = year_topics
    save_file = "figs/%s/%s/gender/%s.pkl" % (THE.version, THE.permitted, file_name)
    with open(save_file, "wb") as f:
        cPkl.dump(venue_topics, f, cPkl.HIGHEST_PROTOCOL)
def get_paper_nodes(self, permitted='conferences'):
    """Return the subset of self.paper_nodes whose venue class is permitted.

    :param permitted: 'conferences', 'journals', or anything else for both.
    :return: {paper_id: paper} filtered by venue.is_conference.
    """
    venues = mysqldb.get_venues()
    selected = {}
    for p_id, paper in self.paper_nodes.items():
        is_conf = venues[paper.venue].is_conference
        if is_conf and permitted == 'journals':
            continue
        if not is_conf and permitted == 'conferences':
            continue
        selected[p_id] = paper
    return selected
def diversity(fig_name, paper_range=None, min_diversity_score=MIN_DIVERSITY_SCORE, save_labels=False):
    """
    Heat map in paper: per-venue topic distributions, plain and dendrogram.

    NOTE(review): this module defines diversity() more than once; the
    definition appearing last in the file is the one that takes effect.

    :param fig_name: base name for PNGs under figs/<version>/<permitted>/diversity/.
    :param paper_range: optional iterable of years; papers outside it are skipped.
    :param min_diversity_score: topic shares below this are zeroed; topics that
        become all-zero across every venue are dropped from the heat map.
    :param save_labels: forwarded to make_dendo_heatmap.
    :return: None (figures are written to disk).
    """
    if paper_range:
        print("DIVERSITY for %s between %d - %d" % (THE.permitted, paper_range[0], paper_range[-1]))
    else:
        print("DIVERSITY for %s" % THE.permitted)
    miner, graph, lda_model, vocab = retrieve_graph_lda_data()
    paper_map = graph.get_papers_by_venue(THE.permitted)
    venue_heatmaps = {}
    venues = mysqldb.get_venues()
    # One topic-count vector per venue, summed over the selected years.
    for conference_id, papers in paper_map.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if paper_range and tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                document = miner.documents.get(paper_id, None)
                if document is not None:
                    topics = np.add(topics, miner.documents[paper_id].topics_count)
        if sum(topics) > 0:
            venue_heatmaps[conference_id] = topics
    # row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), get_topics())]
    row_labels = np.array([name for ind, name in zip(range(lda_model.n_topics), get_topics())])
    # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
    column_labels = np.array([shorter_names(v.acronym) for c, v in venues.items() if v.id in venue_heatmaps])
    # Heatmap: normalized topic distribution per venue (transposed below).
    heatmap_arr = []
    for conference_id in sorted(venue_heatmaps.keys(), key=lambda x: int(x)):
        tot = sum(venue_heatmaps[conference_id])
        # NOTE(review): assumes true division (future import or float counts)
        # — with Python 2 integer division these shares would all be 0.
        dist = [top / tot for top in venue_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    heatmap_arr = np.transpose(heatmap_arr)
    heatmap_arr[heatmap_arr < min_diversity_score] = 0.0
    # Drop topic rows whose every venue share fell below the threshold.
    row_sums = np.sum(heatmap_arr, axis=1)
    heatmap_arr = heatmap_arr[np.where(row_sums > 0)]
    row_labels = row_labels[np.where(row_sums > 0)]
    mkdir("figs/%s/%s/diversity" % (THE.version, THE.permitted))
    make_heatmap(heatmap_arr, row_labels, column_labels,
                 "figs/%s/%s/diversity/%s.png" % (THE.version, THE.permitted, fig_name),
                 paper_range)
    make_dendo_heatmap(heatmap_arr, row_labels, column_labels,
                       "figs/%s/%s/diversity/%s_dend.png" % (THE.version, THE.permitted, fig_name),
                       paper_range, save_labels)
def venue_distributions(save_file, venues, paper_range=None, save_labels=False):
    """Compute topic distributions for selected venues plus aggregate rows.

    Builds topic-count vectors for the venues requested in *venues*, along
    with three aggregate rows ("all", "conference", "journal"), normalizes
    each to a distribution rounded to 3 decimals, and — when save_labels is
    truthy — writes a comma-separated table to *save_file*.

    NOTE(review): this module appears to define venue_distributions twice;
    the later definition wins at import time.

    :param save_file: output CSV path (written only if save_labels).
    :param venues: iterable of venue ids to report individually.
    :param paper_range: optional iterable of years; other years are skipped.
    :param save_labels: when truthy, write the CSV to save_file.
    """
    if paper_range:
        print("DIVERSITY for %s in venues %s between %d - %d" % (THE.permitted, venues, paper_range[0], paper_range[-1]))
    else:
        print("DIVERSITY for %s in venues %s" % (THE.permitted, venues))
    miner, graph, lda_model, vocab = retrieve_graph_lda_data()
    paper_map = graph.get_papers_by_venue(THE.permitted)
    all_venues = mysqldb.get_venues()
    venue_dists = {str(venue): None for venue in venues}
    dataset_dist = np.array([0] * lda_model.n_topics)
    conf_dist = np.array([0] * lda_model.n_topics)
    jour_dist = np.array([0] * lda_model.n_topics)
    for venue_id, papers in paper_map.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if paper_range and tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                document = miner.documents.get(paper_id, None)
                if document is None:
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        if sum(topics) > 0:
            if venue_id in venue_dists:
                venue_dists[venue_id] = topics
            if all_venues[venue_id].is_conference:
                conf_dist = np.add(conf_dist, topics)
            else:
                jour_dist = np.add(jour_dist, topics)
            dataset_dist = np.add(dataset_dist, topics)
    venue_dists["all"] = dataset_dist
    venue_dists["conference"] = conf_dist
    venue_dists["journal"] = jour_dist
    header = ["Venue"] + get_topics()
    rows = [", ".join(header)]
    for venue_id, vals in venue_dists.items():
        # NOTE(review): vals stays None for a requested venue with no matching
        # papers, making sum(vals) raise — confirm callers pass venues with data.
        tot = sum(vals)
        vals = [round(top / tot, 3) for top in vals]
        rows.append(", ".join(map(str, [venue_id] + vals)))
    if save_labels:
        with open(save_file, 'wb') as f:
            f.write("\n".join(rows))
def get_documents(self):
    """Build (and memoize) Document objects for permitted, non-ignored papers.

    Skips papers whose venue acronym is in self.ignores or whose venue class
    conflicts with self.permitted. Uses the abstract as raw text when present
    (and not the literal string 'None'), otherwise the title.

    :return: {paper_id: Document}
    """
    if self.documents:
        return self.documents
    venues = mysqldb.get_venues()
    collected = {}
    for paper_id, paper in self.graph.paper_nodes.items():
        venue = venues[paper.venue]
        if shorter_name(venue.acronym) in self.ignores:
            continue
        if venue.is_conference and self.permitted == 'journals':
            continue
        if not venue.is_conference and self.permitted == 'conferences':
            continue
        has_abstract = paper.abstract is not None and paper.abstract != 'None'
        raw = paper.abstract if has_abstract else paper.title
        collected[paper_id] = Document(raw)
    self.documents = collected
    return collected
def get_papers_by_venue(self, permitted='conferences'):
    """
    Group permitted papers by venue, sorted by year within each venue.

    :param permitted: ['conferences', 'journals', 'all']
    :return: { <conference_id>: [(<paper>, <year>), ...] }
    """
    venues = mysqldb.get_venues()
    by_venue = {}
    for paper_id, paper in self.paper_nodes.items():
        venue = venues[paper.venue]
        if venue.is_conference and permitted == 'journals':
            continue
        if not venue.is_conference and permitted == 'conferences':
            continue
        by_venue.setdefault(paper.venue, []).append((paper.id, paper.year))
    # Order each venue's papers chronologically.
    for venue_id in by_venue:
        by_venue[venue_id] = sorted(by_venue[venue_id], key=lambda tup: tup[1])
    return by_venue
def get_papers_by_authors(self, permitted='conferences'):
    """
    Group permitted papers by author, sorted by (venue id, year).

    :return:{ <author_id> : [(<paper>, <year>, <conference>) , ...] }
    """
    venues = mysqldb.get_venues()
    by_author = {}
    for edge in self.author_edges.values():
        paper = self.paper_nodes[edge.target]
        venue = venues[paper.venue]
        if venue.is_conference and permitted == 'journals':
            continue
        if not venue.is_conference and permitted == 'conferences':
            continue
        by_author.setdefault(edge.source, []).append((paper.id, paper.year, paper.venue))
    # Order each author's papers by numeric venue id, then year.
    for author_id in by_author:
        by_author[author_id] = sorted(by_author[author_id], key=lambda tup: (int(tup[2]), tup[1]))
    return by_author
def venue_distributions(save_file, venues, paper_range=None, save_labels=False):
    """Compute topic distributions for selected venues plus aggregate rows.

    Builds topic-count vectors for the venues requested in *venues*, along
    with three aggregate rows ("all", "conference", "journal"), normalizes
    each to a distribution rounded to 3 decimals, and — when save_labels is
    truthy — writes a comma-separated table to *save_file*.

    NOTE(review): this module appears to define venue_distributions twice;
    the later definition wins at import time.

    :param save_file: output CSV path (written only if save_labels).
    :param venues: iterable of venue ids to report individually.
    :param paper_range: optional iterable of years; other years are skipped.
    :param save_labels: when truthy, write the CSV to save_file.
    """
    if paper_range:
        print("DIVERSITY for %s in venues %s between %d - %d" % (THE.permitted, venues, paper_range[0], paper_range[-1]))
    else:
        print("DIVERSITY for %s in venues %s" % (THE.permitted, venues))
    miner, graph, lda_model, vocab = retrieve_graph_lda_data()
    paper_map = graph.get_papers_by_venue(THE.permitted)
    all_venues = mysqldb.get_venues()
    venue_dists = {str(venue): None for venue in venues}
    dataset_dist = np.array([0] * lda_model.n_topics)
    conf_dist = np.array([0]*lda_model.n_topics)
    jour_dist = np.array([0]*lda_model.n_topics)
    for venue_id, papers in paper_map.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if paper_range and tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                document = miner.documents.get(paper_id, None)
                if document is None:
                    continue
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        if sum(topics) > 0:
            if venue_id in venue_dists:
                venue_dists[venue_id] = topics
            if all_venues[venue_id].is_conference:
                conf_dist = np.add(conf_dist, topics)
            else:
                jour_dist = np.add(jour_dist, topics)
            dataset_dist = np.add(dataset_dist, topics)
    venue_dists["all"] = dataset_dist
    venue_dists["conference"] = conf_dist
    venue_dists["journal"] = jour_dist
    header = ["Venue"] + get_topics()
    rows = [", ".join(header)]
    for venue_id, vals in venue_dists.items():
        # NOTE(review): vals stays None for a requested venue with no matching
        # papers, making sum(vals) raise — confirm callers pass venues with data.
        tot = sum(vals)
        vals = [round(top / tot, 3) for top in vals]
        rows.append(", ".join(map(str, [venue_id] + vals)))
    if save_labels:
        with open(save_file, 'wb') as f:
            f.write("\n".join(rows))
def diversity(fig_name, paper_range=None):
    """Plot topic-by-venue heat maps (dendrogram and plain) under figs/v3/.

    NOTE(review): this module defines diversity() more than once with
    different signatures and output paths; the last definition wins.

    :param fig_name: base name for the PNGs under figs/v3/<permitted>/diversity/.
    :param paper_range: optional iterable of years; papers outside it are skipped.
    :return: None (figures are written to disk).
    """
    if paper_range:
        print("DIVERSITY for %s between %d - %d" % (THE.permitted, paper_range[0], paper_range[-1]))
    else:
        print("DIVERSITY for %s" % THE.permitted)
    miner, graph, lda_model, vocab = retrieve_graph_lda_data()
    paper_map = graph.get_papers_by_venue(THE.permitted)
    venue_topics = {}
    venue_heatmaps = {}
    valid_conferences = []
    venues = mysqldb.get_venues()
    # One topic-count vector per venue, summed over the selected years.
    for conference_id, papers in paper_map.items():
        topics = np.array([0] * lda_model.n_topics)
        for tup in yearize(papers).items():
            if paper_range and tup[0] not in paper_range:
                continue
            for paper_id in tup[1]:
                topics = np.add(topics, miner.documents[paper_id].topics_count)
        if sum(topics) > 0:
            venue_topics[conference_id] = percent_sort(topics)
            venue_heatmaps[conference_id] = topics
            valid_conferences.append(conference_id)
    # row_labels = [str(ind) + "-" + name for ind, name in zip(range(lda_model.n_topics), get_topics())]
    row_labels = [name for ind, name in zip(range(lda_model.n_topics), get_topics())]
    # row_labels = ["%2d" % ind for ind in range(lda_model.n_topics)]
    column_labels = [shorter_names(venue.acronym) for c, venue in venues.items() if venue.id in valid_conferences]
    # Heatmap: normalized topic distribution per venue (transposed below).
    heatmap_arr = []
    for conference_id in sorted(venue_heatmaps.keys(), key=lambda x: int(x)):
        tot = sum(venue_heatmaps[conference_id])
        dist = [top / tot for top in venue_heatmaps[conference_id]]
        heatmap_arr.append(dist)
    # report(lda_model, vocab, n_top_words=15)
    make_dendo_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                       "figs/v3/%s/diversity/%s_dend.png" % (THE.permitted, fig_name), paper_range)
    make_heatmap(np.transpose(heatmap_arr), row_labels, column_labels,
                 "figs/v3/%s/diversity/%s.png" % (THE.permitted, fig_name), paper_range)
def gender_topics(paper_range=None, file_name="topic_contribution"):
    """Pickle per-venue topic distributions split by author gender.

    For each venue, aggregates (across all selected years) the normalized
    topic distribution of every paper three ways: all papers with a non-zero
    distribution, those with at least one male author, and those with at
    least one female author. The mapping
    {venue_id: {"all"|"male"|"female": [dist, ...]}} is pickled to
    figs/<version>/<permitted>/gender/<file_name>.pkl.

    :param paper_range: optional iterable of years (ints); other years skipped.
    :param file_name: base name of the output pickle.
    """
    def index_by_year(tups):
        # (paper_id, year) tuples -> {year: set(paper_ids)}
        y_comm = {}
        for tup in tups:
            comm = y_comm.get(tup[1], set())
            comm.add(tup[0])
            y_comm[tup[1]] = comm
        return y_comm

    def normalize(arr):
        # Scale entries to sum to 1; an all-zero vector stays all-zero.
        arr_tot = sum(arr)
        if arr_tot == 0:
            return [0] * len(arr)
        return [arr_i / arr_tot for arr_i in arr]

    miner, graph, lda_model, vocab = get_graph_lda_data()
    p_venues = graph.get_papers_by_venue(permitted=THE.permitted)
    author_gender_map = get_author_genders()
    venue_topics = OrderedDict()
    for venue_id in sorted(mysqldb.get_venues().keys()):
        venue_id = str(venue_id)
        # NOTE(review): raises KeyError if a venue id has no permitted papers
        # in p_venues — confirm get_papers_by_venue covers every venue.
        year_papers = index_by_year(p_venues[venue_id])
        both_genders_topics = []
        male_topics = []
        female_topics = []
        for year in sorted(year_papers.keys(), key=lambda y: int(y)):
            if (paper_range is not None) and (int(year) not in paper_range):
                continue
            papers = year_papers.get(year, [])
            if len(papers) > 0:
                for paper_id in papers:
                    paper = graph.paper_nodes[paper_id]
                    author_ids = paper.author_ids.strip().split(",")
                    paper_topics = miner.documents[paper_id].topics_count
                    # unit_paper_topics = [t / len(author_ids) for t in paper_topics]
                    male_count, female_count = 0, 0
                    for author_id in author_ids:
                        gender = author_gender_map.get(author_id, None)
                        if gender == 'm':
                            male_count += 1
                        elif gender == 'f':
                            female_count += 1
                    normalized_topics = normalize(paper_topics)
                    # Papers with an all-zero topic vector are excluded entirely.
                    if sum(normalized_topics) > 0:
                        both_genders_topics.append(normalized_topics)
                        if male_count > 0:
                            male_topics.append(normalized_topics)
                        if female_count > 0:
                            female_topics.append(normalized_topics)
                    # male_paper_topics = [male_count * t for t in unit_paper_topics]
                    # female_paper_topics = [female_count * t for t in unit_paper_topics]
                    # both_genders_topics.append(normalize(unit_paper_topics))
                    # male_topics.append(normalize(male_paper_topics))
                    # female_topics.append(normalize(female_paper_topics))
        venue_topics[venue_id] = {
            "all": both_genders_topics,
            "male": male_topics,
            "female": female_topics
        }
    save_file = "figs/%s/%s/gender/%s.pkl" % (THE.version, THE.permitted, file_name)
    with open(save_file, "wb") as f:
        cPkl.dump(venue_topics, f, cPkl.HIGHEST_PROTOCOL)