Beispiel #1
0
def identify():
  """Guess a gender for every named author and write the results to the database.

  For each author with a non-empty name, the first name is extracted with
  ``get_first_name`` and ``from_db`` is consulted first; the
  ``gender_guesser`` detector is used only when ``from_db`` returns ``None``.
  The extracted first name is also stored on the node as ``node.f_name``.

  Labels containing "female" are stored as ``"f"``, labels containing "male"
  as ``"m"``; authors with any other label (e.g. 'unknown' or 'andy') are
  left out of the update. The collected ``(gender, author_id)`` tuples are
  handed to ``mysql.update_genders`` in a single call.
  """
  detector = gender_guesser.detector.Detector(case_sensitive=False)
  authors = mysql.get_authors()
  nodes_by_label = {}
  for _, node in authors.items():
    if not node.name:
      continue
    f_name = get_first_name(node.name)
    label = from_db(f_name)
    if label is None:
      label = detector.get_gender(f_name)
    # Append in place; rebuilding the list with `+` on every insert is quadratic.
    nodes_by_label.setdefault(label, []).append(node)
    node.f_name = f_name
  author_genders = []
  for label, nodes in nodes_by_label.items():
    # "female" must be tested first because "male" is a substring of it.
    if "female" in label:
      gender = "f"
    elif "male" in label:
      gender = "m"
    else:
      continue
    author_genders.extend((gender, int(node.id)) for node in nodes)
  mysql.update_genders(author_genders)
Beispiel #2
0
def identify():
    """Infer author genders from first names and persist them.

    Every author returned by ``mysql.get_authors()`` that has a non-empty
    name gets its first name (via ``get_first_name``) stored on the node as
    ``f_name``. The gender label comes from ``from_db`` when that returns a
    value, otherwise from a case-insensitive ``gender_guesser`` detector.

    Only labels containing "female" (saved as ``"f"``) or "male" (saved as
    ``"m"``) are written; authors with any other label, such as 'unknown'
    or 'andy', are skipped. The result is passed to
    ``mysql.update_genders`` as a list of ``(gender, int(author_id))``
    tuples.
    """
    detector = gender_guesser.detector.Detector(case_sensitive=False)
    authors = mysql.get_authors()
    grouped = {}
    for _, node in authors.items():
        if not node.name:
            continue
        f_name = get_first_name(node.name)
        label = from_db(f_name)
        if label is None:
            label = detector.get_gender(f_name)
        # setdefault + append avoids copying the whole list on each insert.
        grouped.setdefault(label, []).append(node)
        node.f_name = f_name

    author_genders = []
    for label, nodes in grouped.items():
        # Check "female" before "male": the latter is a substring of the former.
        if "female" in label:
            gender = "f"
        elif "male" in label:
            gender = "m"
        else:
            continue
        for node in nodes:
            author_genders.append((gender, int(node.id)))
    mysql.update_genders(author_genders)
Beispiel #3
0
def get_author_genders():
  """Return ``{author_id: gender}`` for every author whose gender is set.

  Authors whose ``gender`` attribute is falsy are omitted from the mapping.
  """
  return {
    a_id: node.gender
    for a_id, node in mysqldb.get_authors().items()
    if node.gender
  }
Beispiel #4
0
def get_authors_by_h_index(save_file, top_count=100):
  """Print the highest-scoring authors from a pickled ``{author_id: score}`` map.

  Args:
    save_file: Path of the pickle file holding the author-id -> score mapping
      (presumably h-index values, going by the function name).
    top_count: Number of leading entries to print.

  Each printed row contains the 1-based rank, the author's name and the score,
  ordered by descending score.
  """
  # Pickle streams are binary; text mode corrupts or rejects them.
  # NOTE(review): unpickling executes arbitrary code; only load trusted files.
  with open(save_file, "rb") as f:
    author_data = cPkl.load(f)
  authors = mysql.get_authors()
  ranked = sorted(
    ((a_id, authors[a_id]['name'], score) for a_id, score in author_data.items()),
    key=lambda row: row[2],
    reverse=True,
  )
  for rank, (_, name, score) in enumerate(ranked[:top_count], start=1):
    print(rank, name, score)
Beispiel #5
0
def get_author_genders():
    """Map author ids to their gender, leaving out authors with no gender.

    Returns:
        dict keyed by author id; only authors with a truthy ``gender``
        attribute are included.
    """
    gender_map = {}
    for a_id, node in mysqldb.get_authors().items():
        if node.gender:
            gender_map[a_id] = node.gender
    return gender_map
Beispiel #6
0
def get_author_genders():
  """Return ``{author_id: gender}`` for all authors.

  Authors whose ``gender`` attribute is falsy are mapped to ``"u"``.
  """
  authors = mysql.get_authors()
  return {
    a_id: (node.gender if node.gender else "u")
    for a_id, node in authors.items()
  }
Beispiel #7
0
def print_top_author_names(file_name):
  """Print the names of the 100 highest-scoring authors in a pickled score file.

  Args:
    file_name: Path handed to ``open_pkl``; the loaded object is used as an
      ``{author_id: score}`` mapping.

  The names are printed as a single list, ordered by descending score.
  """
  authors = mysql.get_authors()
  # Rank on the score itself. Sorting the bare name strings with key x[1]
  # ordered them by their second character and ignored the scores.
  ranked = sorted(open_pkl(file_name).items(), key=lambda item: item[1], reverse=True)
  print([authors[a_id]['name'] for a_id, _ in ranked[:100]])
Beispiel #8
0
def plot_damp_top_authors(folder, damps, top, min_year, plot_author_count=20, show_legend=True):
  """Plot page-rank scores of the most cited authors for several damping factors.

  One curve is drawn per entry in ``damps``. The scores for each damping factor
  are read from ``figs/<version>/<permitted>/authors/<folder>/page_rank_<damp>.pkl``
  and the authors on the x axis are ordered by descending summed score across
  all damping factors. The figure is saved to
  ``figs/<version>/<permitted>/authors/damp_<folder>.png``.

  Args:
    folder: Sub-folder containing the per-damping-factor pickle files; also
      used in the output file name.
    damps: Sequence of damping factors; each must have a matching pickle file.
    top: Passed through to ``most_cited_authors``.
    min_year: Passed through to ``most_cited_authors``.
    plot_author_count: Maximum number of authors to plot.
    show_legend: Whether to draw a legend listing the damping factors.
  """
  graph = cite_graph(GRAPH_CSV)
  top_authors = most_cited_authors(graph, top, min_year)[:plot_author_count]
  # Fewer authors than requested may come back, so size everything by the
  # actual count to keep the array shapes consistent.
  author_count = len(top_authors)
  x_axis = range(1, author_count + 1)
  top_author_ids = np.array([a[0] for a in top_authors])
  folder_path = "figs/%s/%s/authors/%s" % (THE.version, THE.permitted, folder)
  # The palette is indexed per curve (one per damping factor), so it needs at
  # least len(damps) colours. Keeping plot_author_count as the lower bound
  # preserves the existing colours when there are few damping factors.
  palette = np.array(sns.color_palette("hls", max(plot_author_count, len(damps))))
  y_axes = []
  totals = np.zeros(author_count)
  plt.figure(figsize=(8, 2))
  for damp in damps:
    file_name = "%s/page_rank_%0.2f.pkl" % (folder_path, damp)
    # Pickle streams are binary; text mode corrupts or rejects them.
    with open(file_name, "rb") as f:
      pr_scores = cPkl.load(f)
    y_axis = np.array([pr_scores[a] for a in top_author_ids])
    y_axes.append(y_axis)
    totals += y_axis
  # Order authors by descending summed score across all damping factors.
  indices = np.argsort(totals)[::-1]
  top_author_ids = top_author_ids[indices]
  sns.set_style("white")
  legends = []
  for i, y_axis in enumerate(y_axes):
    plt.plot(x_axis, y_axis[indices], c=palette[i])
    legends.append("%0.2f" % damps[i])
  if show_legend:
    plt.legend(legends, bbox_to_anchor=(-0.1, 1.15, 1.15, 0.2), loc="lower left",
               mode="expand", borderaxespad=0, ncol=10)
  fig_name = "figs/%s/%s/authors/damp_%s.png" % (THE.version, THE.permitted, folder)
  plt.ylabel("Page Rank Score", fontsize=14)
  plt.xlabel("Author ID", fontsize=14)
  plt.xticks(x_axis, top_author_ids, rotation='vertical')
  plt.title("Page Rank Score for top %d cited author with varying damping factors" % author_count)
  plt.savefig(fig_name, bbox_inches='tight')
  plt.clf()
Beispiel #9
0
    def from_file(file_name, delimiter='$|$'):
        """Build a ``Graph`` from a delimiter-separated paper file.

        The first line of the file supplies the (lower-cased) column names;
        every following line becomes a ``Node`` of type "paper" whose fields
        are filled from the columns. Author nodes come from
        ``mysqldb.get_authors()``.

        Args:
            file_name: Path of the paper file.
            delimiter: Column separator used on every line.

        Returns:
            A ``Graph`` with ``paper_nodes``, ``author_nodes``,
            ``author_edges``, ``cite_edges`` and ``collaborator_edges`` set,
            and PC membership added from ``mysqldb.get_pc_membership()``.
        """
        author_nodes = mysqldb.get_authors()
        paper_nodes, ref_nodes = {}, {}
        author_edges, cite_edges, collaborator_edges = {}, {}, {}

        def add_collaborator_edges(authors):
            # Visit every unordered author pair once; with fewer than two
            # authors the loops simply do not run.
            for position, first in enumerate(authors):
                for second in authors[position + 1:]:
                    pair_ids = (first.id, second.id)
                    low, high = min(pair_ids), max(pair_ids)
                    key = low + "-" + high
                    known = collaborator_edges.get(key)
                    if known is not None:
                        known.count += 1
                    else:
                        collaborator_edges[key] = Edge(source=low,
                                                       target=high,
                                                       edge_type="collaborator",
                                                       count=1)

        with open(file_name, 'rb') as f:
            column_names = f.readline().strip().lower().split(delimiter)
            for raw_line in f.readlines():
                # Round-trip through UTF-8, ignoring errors, to drop
                # undecodable byte sequences.
                cleaned = raw_line.decode('utf-8', 'ignore').encode("utf-8")
                columns = cleaned.strip().split(delimiter)
                paper_node = Node()
                for column_name, value in zip(column_names, columns):
                    paper_node[column_name] = value
                paper_node["type"] = "paper"
                if paper_node.ref_id:
                    ref_nodes[paper_node.ref_id] = paper_node
                paper_nodes[paper_node.id] = paper_node

                id_fields = columns[AUTHOR_ID_INDEX].split(",")
                name_fields = columns[AUTHOR_NAME_INDEX].split(",")
                paper_authors = []
                # The names are not used, but zipping keeps the iteration
                # bounded by the shorter of the two lists.
                for author_id, _ in zip(id_fields, name_fields):
                    author_node = author_nodes[author_id]
                    paper_authors.append(author_node)
                    edge = Edge(source=author_node.id,
                                target=paper_node.id,
                                edge_type="author")
                    author_edges[edge.id] = edge
                add_collaborator_edges(paper_authors)

            # Second pass: link papers through their reference ids and count
            # how often each paper is cited from within this file.
            cited_counts = {}
            for paper in paper_nodes.values():
                if not (paper.ref_id and paper.cites):
                    continue
                source = ref_nodes[paper.ref_id]
                for ref_id in paper.cites.split(","):
                    target = ref_nodes.get(ref_id)
                    if not target:
                        continue
                    cited_counts[target.id] = cited_counts.get(target.id, 0) + 1
                    edge = Edge(source=source.id,
                                target=target.id,
                                edge_type="cites")
                    cite_edges[edge.id] = edge
            for paper_id, paper in paper_nodes.items():
                paper["local_cites"] = cited_counts.get(paper_id, 0)

        graph = Graph()
        graph.paper_nodes = paper_nodes
        graph.author_nodes = author_nodes
        graph.author_edges = author_edges
        graph.cite_edges = cite_edges
        graph.collaborator_edges = collaborator_edges
        graph.add_pc_membership(mysqldb.get_pc_membership())
        return graph
Beispiel #10
0
  def from_file(file_name, delimiter='$|$'):
    """Load papers from a delimited file and assemble them into a ``Graph``.

    Column names are taken from the first line (lower-cased). Each remaining
    line yields one paper ``Node``, one "author" edge per listed author, and
    "collaborator" edges between every pair of the paper's authors. Papers
    that carry a ``ref_id`` are then connected by "cites" edges, and each
    paper receives a ``local_cites`` count of citations found in this file.

    Args:
      file_name: Path of the paper file.
      delimiter: Column separator used on every line.

    Returns:
      The populated ``Graph``, including PC membership from
      ``mysqldb.get_pc_membership()``.
    """
    paper_nodes = {}
    ref_nodes = {}
    author_edges = {}
    cite_edges = {}
    collaborator_edges = {}
    author_nodes = mysqldb.get_authors()

    def add_collaborator_edges(authors):
      author_total = len(authors)
      if author_total < 2:
        return
      for left in range(author_total - 1):
        for right in range(left + 1, author_total):
          id_a, id_b = authors[left].id, authors[right].id
          low = min(id_a, id_b)
          high = max(id_a, id_b)
          # The key is order-independent, so each pair shares one edge.
          key = low + "-" + high
          if key in collaborator_edges:
            collaborator_edges[key].count += 1
          else:
            collaborator_edges[key] = Edge(source=low, target=high, edge_type="collaborator", count=1)

    with open(file_name, 'rb') as f:
      header = f.readline()
      column_names = header.strip().lower().split(delimiter)
      for row in f.readlines():
        # Discard byte sequences that are not valid UTF-8.
        row = row.decode('utf-8', 'ignore').encode("utf-8")
        columns = row.strip().split(delimiter)
        paper_node = Node()
        for field, content in zip(column_names, columns):
          paper_node[field] = content
        paper_node["type"] = "paper"
        if paper_node.ref_id:
          ref_nodes[paper_node.ref_id] = paper_node
        paper_nodes[paper_node.id] = paper_node
        paper_authors = []
        author_pairs = zip(columns[AUTHOR_ID_INDEX].split(","), columns[AUTHOR_NAME_INDEX].split(","))
        for author_id, _ in author_pairs:
          author_node = author_nodes[author_id]
          paper_authors.append(author_node)
          author_edge = Edge(source=author_node.id, target=paper_node.id, edge_type="author")
          author_edges[author_edge.id] = author_edge
        add_collaborator_edges(paper_authors)

      # Citation pass: only papers with both a ref_id and a cites list
      # produce outgoing "cites" edges.
      cited_counts = {}
      for paper in paper_nodes.values():
        if paper.ref_id:
          references = paper.cites
          if references:
            source = ref_nodes[paper.ref_id]
            for ref_id in references.split(","):
              target = ref_nodes.get(ref_id, None)
              if target:
                cited_counts[target.id] = cited_counts.get(target.id, 0) + 1
                cite_edge = Edge(source=source.id, target=target.id, edge_type="cites")
                cite_edges[cite_edge.id] = cite_edge
      for paper_id, paper in paper_nodes.items():
        paper["local_cites"] = cited_counts.get(paper_id, 0)
    graph = Graph()
    graph.paper_nodes = paper_nodes
    graph.author_nodes = author_nodes
    graph.author_edges = author_edges
    graph.cite_edges = cite_edges
    graph.collaborator_edges = collaborator_edges
    graph.add_pc_membership(mysqldb.get_pc_membership())
    return graph