def run(self):
    """Build one representative document per author by joining the
    repdocs of all of the author's papers with '|' and writing the
    (author_id, doc) rows out as csv.
    """
    paper_repdocs_file, author_file = self.input()
    with paper_repdocs_file.open() as pfile:
        paper_df = pd.read_csv(pfile, index_col=(0,))
    # Missing repdoc text becomes an empty string rather than NaN.
    paper_df.fillna('', inplace=True)

    # Read out authorship records.
    with author_file.open() as afile:
        author_df = pd.read_csv(afile, header=0, index_col=(0,))

    # Initialize repdoc dictionary from the complete list of person ids
    # so authors with no surviving papers still get an (empty) row.
    author_ids = author_df.index.unique()
    repdocs = {i: [] for i in author_ids}

    # Build up repdocs for each author.
    for person_id, paper_id in author_df.itertuples():
        # Skip authorship records that reference papers not present in
        # the repdocs frame. (paper_df.index, not paper_df.doc.index —
        # they are the same object, but this is the direct spelling.)
        if paper_id in paper_df.index:
            doc = paper_df.loc[paper_id]['doc']
            repdocs[person_id].append(doc)

    # Save repdocs. Use items() rather than the Python-2-only
    # iteritems() so the task also runs under Python 3.
    rows = ((person_id, '|'.join(docs))
            for person_id, docs in repdocs.items())
    util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
def run(self):
    """Annotate each node of the author LCC graph with the ids of the
    venues the author has published in, then save the annotated graph
    (gzipped pickle) and the venue id -> name map (csv).
    """
    graph_file, idmap_file, paper_file, author_file = self.input()

    # Read in dependencies.
    lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
    author_venue_df = self.build_linked_venue_frame()
    venue_map = self.assign_venue_ids(author_venue_df)
    records = util.iter_csv_fwrapper(idmap_file)
    lcc_idmap = {record[0]: int(record[1]) for record in records}

    # Use sets in order to ensure uniqueness.
    for v in lcc.vs:
        v['venues'] = set()

    # Add the venue IDs to the node venue sets.
    for rownum, (author_id, venue) in author_venue_df.iterrows():
        node_id = lcc_idmap[str(author_id)]
        venue_id = venue_map[venue]
        lcc.vs[node_id]['venues'].add(venue_id)

    # Convert the sets to tuples.
    for v in lcc.vs:
        v['venues'] = tuple(v['venues'])

    # Save a copy of the graph with venue info.
    pickle_outfile, venue_map_outfile = self.output()
    lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

    # items() instead of Python-2-only iteritems() for Py3 compatibility.
    rows = ((vnum, venue) for venue, vnum in venue_map.items())
    util.write_csv_to_fwrapper(
        venue_map_outfile, ('venue_id', 'venue_name'), rows)
def run(self):
    """Construct the paper citation graph, attach venue and author-id
    attributes to its nodes, and persist it along with a
    paper_id -> node_id mapping. Returns the built graph.
    """
    refg = igraph.Graph()
    refg.add_vertices(self.read_paper_vertices())

    # Build and save the paper id to node id mapping.
    idmap = {}
    for vertex in refg.vs:
        idmap[str(vertex['name'])] = vertex.index
    util.write_csv_to_fwrapper(
        self.idmap_output_file, ('paper_id', 'node_id'),
        sorted(idmap.items()))

    # Attach venues to nodes as paper attributes.
    for paper_id, venue in self.read_paper_venues():
        refg.vs[idmap[paper_id]]['venue'] = venue

    # Attach the list of author ids for each paper.
    for vertex in refg.vs:
        vertex['author_ids'] = []
    for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
        refg.vs[idmap[paper_id]]['author_ids'].append(author_id)

    # Finally, add edges from citation records.
    refg.add_edges(self.read_paper_references(idmap))

    # Save in both pickle and graphml formats.
    refg.write_picklez(self.pickle_output_file.path)
    refg.write_graphmlz(self.graphml_output_file.path)
    return refg
def run(self):
    """Annotate each node of the author LCC graph with the ids of the
    venues the author has published in, then save the annotated graph
    (gzipped pickle) and the venue id -> name map (csv).
    """
    graph_file, idmap_file, paper_file, author_file = self.input()

    # Read in dependencies.
    lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
    author_venue_df = self.build_linked_venue_frame()
    venue_map = self.assign_venue_ids(author_venue_df)
    records = util.iter_csv_fwrapper(idmap_file)
    lcc_idmap = {record[0]: int(record[1]) for record in records}

    # Use sets in order to ensure uniqueness.
    for v in lcc.vs:
        v['venues'] = set()

    # Add the venue IDs to the node venue sets.
    for rownum, (author_id, venue) in author_venue_df.iterrows():
        node_id = lcc_idmap[str(author_id)]
        venue_id = venue_map[venue]
        lcc.vs[node_id]['venues'].add(venue_id)

    # Convert the sets to tuples.
    for v in lcc.vs:
        v['venues'] = tuple(v['venues'])

    # Save a copy of the graph with venue info.
    pickle_outfile, venue_map_outfile = self.output()
    lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

    # items() instead of Python-2-only iteritems() for Py3 compatibility.
    rows = ((vnum, venue) for venue, vnum in venue_map.items())
    util.write_csv_to_fwrapper(
        venue_map_outfile, ('venue_id', 'venue_name'), rows)
def run(self):
    """Construct the paper citation graph, attach venue and author-id
    attributes to its nodes, and persist it along with a
    paper_id -> node_id mapping. Returns the built graph.
    """
    refg = igraph.Graph()
    refg.add_vertices(self.read_paper_vertices())

    # Build and save the paper id to node id mapping.
    idmap = {}
    for vertex in refg.vs:
        idmap[str(vertex['name'])] = vertex.index
    util.write_csv_to_fwrapper(
        self.idmap_output_file, ('paper_id', 'node_id'),
        sorted(idmap.items()))

    # Attach venues to nodes as paper attributes.
    for paper_id, venue in self.read_paper_venues():
        refg.vs[idmap[paper_id]]['venue'] = venue

    # Attach the list of author ids for each paper.
    for vertex in refg.vs:
        vertex['author_ids'] = []
    for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
        refg.vs[idmap[paper_id]]['author_ids'].append(author_id)

    # Finally, add edges from citation records.
    refg.add_edges(self.read_paper_references(idmap))

    # Save in both pickle and graphml formats.
    refg.write_picklez(self.pickle_output_file.path)
    refg.write_graphmlz(self.graphml_output_file.path)
    return refg
def run(self):
    """The repdoc for a single paper consists of its title and abstract,
    concatenated with space between. The paper records are read from a
    csv file and written out as (paper_id, repdoc) pairs.
    """
    rows = ((paper_id, repdoc.encode('utf-8'))
            for paper_id, repdoc in self.read_paper_repdocs())
    util.write_csv_to_fwrapper(self.output(), ('paper_id', 'doc'), rows)
def run(self):
    """Assemble the undirected author graph and save it as gzipped
    graphml together with an author_id -> node_id mapping csv.
    """
    authorg = util.build_undirected_graph(
        self.read_author_ids(), self.get_edges())

    # Write the graph to a gzipped graphml file.
    graph_output_file, idmap_output_file = self.output()
    authorg.write_graphmlz(graph_output_file.path)

    # Build and save the ID map, sorted by author id.
    idmap = {vertex['name']: vertex.index for vertex in authorg.vs}
    util.write_csv_to_fwrapper(
        idmap_output_file, ('author_id', 'node_id'),
        sorted(idmap.items()))
def run(self):
    """Assemble the undirected author graph and save it as gzipped
    graphml together with an author_id -> node_id mapping csv.
    """
    authorg = util.build_undirected_graph(
        self.read_author_ids(), self.get_edges())

    # Write the graph to a gzipped graphml file.
    graph_output_file, idmap_output_file = self.output()
    authorg.write_graphmlz(graph_output_file.path)

    # Build and save the ID map, sorted by author id.
    idmap = {vertex['name']: vertex.index for vertex in authorg.vs}
    util.write_csv_to_fwrapper(
        idmap_output_file, ('author_id', 'node_id'),
        sorted(idmap.items()))
def run(self):
    """Extract the largest connected component (LCC) of the author
    graph and save it as graphml and edgelist, plus its id map.
    """
    graphml_outfile, edgelist_outfile, idmap_outfile = self.output()
    author_graph_file, _ = self.input()

    # Read the full graph and take its giant component.
    authorg = igraph.Graph.Read_GraphMLz(author_graph_file.path)
    lcc = authorg.components().giant()
    lcc.write_graphmlz(graphml_outfile.path)
    lcc.write_edgelist(edgelist_outfile.path)

    # Build and save the id map, sorted by author id.
    idmap = {vertex['name']: vertex.index for vertex in lcc.vs}
    util.write_csv_to_fwrapper(
        idmap_outfile, ('author_id', 'node_id'), sorted(idmap.items()))
def run(self):
    """Extract the largest connected component (LCC) of the author
    graph and save it as graphml and edgelist, plus its id map.
    """
    graphml_outfile, edgelist_outfile, idmap_outfile = self.output()
    author_graph_file, _ = self.input()

    # Read the full graph and take its giant component.
    authorg = igraph.Graph.Read_GraphMLz(author_graph_file.path)
    lcc = authorg.components().giant()
    lcc.write_graphmlz(graphml_outfile.path)
    lcc.write_edgelist(edgelist_outfile.path)

    # Build and save the id map, sorted by author id.
    idmap = {vertex['name']: vertex.index for vertex in lcc.vs}
    util.write_csv_to_fwrapper(
        idmap_outfile, ('author_id', 'node_id'), sorted(idmap.items()))
def run(self):
    """Build one representative document per author by joining the
    repdocs of all of the author's papers with '|' and writing the
    (author_id, doc) rows out as csv.
    """
    paper_repdocs_file, author_file = self.input()
    with paper_repdocs_file.open() as pfile:
        paper_df = pd.read_csv(pfile, index_col=(0,))
    # Missing repdoc text becomes an empty string rather than NaN.
    paper_df.fillna('', inplace=True)

    # Read out authorship records.
    with author_file.open() as afile:
        author_df = pd.read_csv(afile, header=0, index_col=(0,))

    # Initialize repdoc dictionary from the complete list of person ids
    # so authors with no surviving papers still get an (empty) row.
    author_ids = author_df.index.unique()
    repdocs = {i: [] for i in author_ids}

    # Build up repdocs for each author.
    for person_id, paper_id in author_df.itertuples():
        # Skip authorship records referencing papers missing from the
        # repdocs frame; .loc would otherwise raise KeyError.
        if paper_id in paper_df.index:
            doc = paper_df.loc[paper_id]['doc']
            repdocs[person_id].append(doc)

    # Save repdocs. Use items() rather than the Python-2-only
    # iteritems() so the task also runs under Python 3.
    rows = ((person_id, '|'.join(docs))
            for person_id, docs in repdocs.items())
    util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
def run(self):
    """Write (author_id, paper_id) authorship records out as csv."""
    util.write_csv_to_fwrapper(
        self.output(), ('author_id', 'paper_id'), self.iter_authorships())
def run(self):
    """Write (author_id, paper_id) authorship records out as csv."""
    util.write_csv_to_fwrapper(
        self.output(), ('author_id', 'paper_id'), self.iter_authorships())
def run(self):
    """Write author (id, name) pairs out as csv."""
    util.write_csv_to_fwrapper(
        self.output(), ('id', 'name'), self.read_author_id_name_pairs())
def run(self):
    """Vectorize each paper repdoc and write (paper_id, doc) rows, where
    doc is the '|'-joined, utf-8 encoded term list produced by
    doctovec.vectorize.
    """
    # Decode, vectorize, join, and re-encode in a single lazy pipeline.
    rows = (
        (docid,
         '|'.join(doctovec.vectorize(raw.decode('utf-8'))).encode('utf-8'))
        for docid, raw in util.iter_csv_fwrapper(self.input()))
    util.write_csv_to_fwrapper(self.output(), ('paper_id', 'doc'), rows)