def get_edges(self):
    """Yield all edges from a file in which each line contains an
    (author, paper) pair.

    Reads the paper-id -> node-id mapping, loads the pickled paper
    graph, then delegates to ``self.get_paper_edges`` for each
    (author, paper) record and yields every resulting edge.
    """
    records = util.iter_csv_fwrapper(self.paper_idmap_file)
    idmap = {record[0]: int(record[1]) for record in records}
    refg = igraph.Graph.Read_Picklez(self.paper_graph_file.open())
    records = util.iter_csv_fwrapper(self.author_file)
    rows = ((refg, idmap[paper_id], author_id)
            for author_id, paper_id in records)
    # Iterate the generator directly instead of calling `rows.next()`
    # inside `while True`: `.next()` is Python-2-only, and letting the
    # final StopIteration escape to end this generator breaks under
    # PEP 479 (Python 3.7+).
    for row in rows:
        for edge in self.get_paper_edges(*row):
            yield edge
def run(self):
    """Convert pipe-delimited repdoc vectors into a serialized
    gensim bag-of-words corpus (MmCorpus format).
    """
    dict_file, vecs_file = self.input()
    vocab = gensim.corpora.Dictionary.load(dict_file.path)
    # Lazily split each stored repdoc back into its token list.
    token_stream = (
        raw_doc.decode('utf-8').split('|')
        for _, raw_doc in util.iter_csv_fwrapper(vecs_file))
    bow_stream = (vocab.doc2bow(tokens) for tokens in token_stream)
    gensim.corpora.MmCorpus.serialize(self.output().path, bow_stream)
def run(self):
    """Build the paper citation graph and persist it.

    Writes the paper-id -> node-id mapping as a CSV side product and
    saves the graph in both pickle and graphml formats. Returns the
    constructed igraph.Graph.
    """
    graph = igraph.Graph()
    graph.add_vertices(self.read_paper_vertices())

    # Map each paper id to its vertex index and persist the mapping.
    idmap = {str(vertex['name']): vertex.index for vertex in graph.vs}
    util.write_csv_to_fwrapper(
        self.idmap_output_file, ('paper_id', 'node_id'),
        sorted(idmap.items()))

    # Attach the venue to each paper vertex as a node attribute.
    for paper_id, venue in self.read_paper_venues():
        graph.vs[idmap[paper_id]]['venue'] = venue

    # Attach the list of author ids to each paper vertex.
    for vertex in graph.vs:
        vertex['author_ids'] = []
    for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
        graph.vs[idmap[paper_id]]['author_ids'].append(author_id)

    # Wire up the citation edges, then save in both formats.
    graph.add_edges(self.read_paper_references(idmap))
    graph.write_picklez(self.pickle_output_file.path)
    graph.write_graphmlz(self.graphml_output_file.path)
    return graph
def run(self):
    """Assemble the paper citation graph with venue and author
    attributes, then persist it.

    Side products: the paper-id -> node-id mapping CSV, plus the graph
    saved in both pickle and graphml formats. Returns the igraph.Graph.
    """
    refg = igraph.Graph()
    nodes = self.read_paper_vertices()
    refg.add_vertices(nodes)
    # Build and save paper id to node id mapping
    idmap = {str(v['name']): v.index for v in refg.vs}
    rows = sorted(idmap.items())
    util.write_csv_to_fwrapper(self.idmap_output_file, ('paper_id', 'node_id'), rows)
    # Now add venues to nodes as paper attributes
    for paper_id, venue in self.read_paper_venues():
        node_id = idmap[paper_id]
        refg.vs[node_id]['venue'] = venue
    # next add author ids
    for v in refg.vs:
        v['author_ids'] = []
    for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
        node_id = idmap[paper_id]
        refg.vs[node_id]['author_ids'].append(author_id)
    # Finally add edges from citation records
    citation_links = self.read_paper_references(idmap)
    refg.add_edges(citation_links)
    # Save in both pickle and graphml formats
    refg.write_picklez(self.pickle_output_file.path)
    refg.write_graphmlz(self.graphml_output_file.path)
    return refg
def read_paper_references(self, idmap):
    """Filter out references to papers outside dataset.

    Args:
        idmap: mapping from paper id (str) to node id (int).

    Yields:
        (node_id, node_id) pairs for each in-dataset citation.
    """
    for paper_id, ref_id in util.iter_csv_fwrapper(self.refs_file):
        # Only a missing id signals an out-of-dataset reference; the
        # original bare `except:` also swallowed KeyboardInterrupt,
        # SystemExit, and genuine bugs.
        try:
            yield (idmap[paper_id], idmap[ref_id])
        except KeyError:
            continue
def run(self):
    """Write ground-truth venue communities: one file keyed by venue,
    one listing each author's venues.
    """
    lcc_pickle_file, venue_map_file = self.input()

    # Read in the LCC graph.
    lcc = igraph.Graph.Read_Picklez(lcc_pickle_file.path)

    # Build the community mapping: each venue id collects the node ids
    # of its member authors.
    venue_records = util.iter_csv_fwrapper(venue_map_file)
    communities = {int(venue_id): [] for venue_id, _ in venue_records}
    for vertex in lcc.vs:
        for venue_id in vertex['venues']:
            communities[venue_id].append(vertex.index)

    by_venue_file, by_author_file = self.output()

    # One line per venue: space-separated member node ids.
    ordered_comms = sorted(communities.items())
    venue_lines = (' '.join(map(str, members))
                   for _, members in ordered_comms)
    with by_venue_file.open('w') as f:
        f.write('\n'.join(venue_lines))

    # One line per author node: space-separated venue ids.
    author_records = sorted([(v.index, v['venues']) for v in lcc.vs])
    author_lines = (' '.join(map(str, venue_ids))
                    for _, venue_ids in author_records)
    with by_author_file.open('w') as f:
        f.write('\n'.join(author_lines))
def run(self):
    """Attach venue-id tuples to the LCC graph's nodes, then save the
    graph (pickle) and the venue id -> name mapping (CSV).
    """
    graph_file, idmap_file, paper_file, author_file = self.input()

    # Read in dependencies.
    lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
    author_venue_df = self.build_linked_venue_frame()
    venue_map = self.assign_venue_ids(author_venue_df)
    records = util.iter_csv_fwrapper(idmap_file)
    lcc_idmap = {record[0]: int(record[1]) for record in records}

    # Use sets in order to ensure uniqueness.
    for v in lcc.vs:
        v['venues'] = set()

    # Add the venue IDs to the node venue sets.
    for rownum, (author_id, venue) in author_venue_df.iterrows():
        node_id = lcc_idmap[str(author_id)]
        venue_id = venue_map[venue]
        lcc.vs[node_id]['venues'].add(venue_id)

    # Convert the sets to tuples.
    for v in lcc.vs:
        v['venues'] = tuple(v['venues'])

    # save a copy of the graph with venue info
    pickle_outfile, venue_map_outfile = self.output()
    lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

    # `items()` instead of the Python-2-only `iteritems()`: equivalent
    # here and keeps the task runnable on both Python 2 and 3.
    rows = ((vnum, venue) for venue, vnum in venue_map.items())
    util.write_csv_to_fwrapper(
        venue_map_outfile, ('venue_id', 'venue_name'), rows)
def run(self):
    """Annotate each LCC node with the tuple of venue ids its author
    published in, then persist the graph and the venue-id mapping.
    """
    graph_file, idmap_file, paper_file, author_file = self.input()

    # Read in dependencies.
    lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
    author_venue_df = self.build_linked_venue_frame()
    venue_map = self.assign_venue_ids(author_venue_df)
    records = util.iter_csv_fwrapper(idmap_file)
    lcc_idmap = {record[0]: int(record[1]) for record in records}

    # Use sets in order to ensure uniqueness.
    for v in lcc.vs:
        v['venues'] = set()

    # Add the venue IDs to the node venue sets.
    for rownum, (author_id, venue) in author_venue_df.iterrows():
        node_id = lcc_idmap[str(author_id)]
        venue_id = venue_map[venue]
        lcc.vs[node_id]['venues'].add(venue_id)

    # Convert the sets to tuples.
    for v in lcc.vs:
        v['venues'] = tuple(v['venues'])

    # save a copy of the graph with venue info
    pickle_outfile, venue_map_outfile = self.output()
    lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

    # `items()` replaces the Python-2-only `iteritems()`; behavior is
    # the same and the code stays portable to Python 3.
    rows = ((vnum, venue) for venue, vnum in venue_map.items())
    util.write_csv_to_fwrapper(venue_map_outfile,
                               ('venue_id', 'venue_name'), rows)
def read_lcc_author_repdocs(self):
    """Read and return an iterator over the author repdoc corpus,
    which excludes the authors not in the LCC.
    """
    author_repdoc_file, _, lcc_idmap_file = self.input()
    with lcc_idmap_file.open() as lcc_idmap_f:
        lcc_author_df = pd.read_csv(lcc_idmap_f, header=0, usecols=(0,))

    # A set gives O(1) membership tests; `in` against the raw numpy
    # array rescans the whole array for every record (O(n*m) overall).
    lcc_author_ids = set(lcc_author_df['author_id'].values)

    # Repdocs can exceed csv's default field size cap. `sys.maxint` is
    # Python-2-only, so fall back to `sys.maxsize` where it is absent.
    csv.field_size_limit(getattr(sys, 'maxint', sys.maxsize))

    records = util.iter_csv_fwrapper(author_repdoc_file)
    return (doc.split('|') for author_id, doc in records
            if int(author_id) in lcc_author_ids)
def read_paper_repdocs(self):
    """Yield (paper_id, repdoc) pairs from the paper CSV.

    A repdoc is columns 1 and 4 of the record joined by a space
    (presumably title and abstract -- confirm against the paper file
    schema), decoded as utf-8.
    """
    paper_file = self.input()
    for rec in util.iter_csv_fwrapper(paper_file):
        combined = '%s %s' % (rec[1], rec[4])
        yield (rec[0], combined.decode('utf-8'))
def read_paper_venues(self):
    """Iterate through (paper_id, venue) pairs from the paper csv file.

    Yields:
        (paper_id, venue) taken from columns 0 and 2 of each record.
    """
    for rec in util.iter_csv_fwrapper(self.papers_file):
        paper_id, venue = rec[0], rec[2]
        yield (paper_id, venue)
def run(self):
    """Vectorize every repdoc and write (paper_id, doc) CSV rows, with
    each token list pipe-joined and utf-8 encoded.
    """
    raw_records = util.iter_csv_fwrapper(self.input())
    out_rows = (
        (docid,
         '|'.join(doctovec.vectorize(raw.decode('utf-8'))).encode('utf-8'))
        for docid, raw in raw_records)
    util.write_csv_to_fwrapper(self.output(), ('paper_id', 'doc'), out_rows)
def read_repdocs(self):
    """Return an iterator of token lists: each stored repdoc split on
    the '|' delimiter.
    """
    rows = util.iter_csv_fwrapper(self.input())
    return (body.split('|') for _, body in rows)