Ejemplo n.º 1
0
    def run(self):
        """Build one '|'-joined representative document per author.

        Reads the paper repdoc CSV and the authorship CSV from
        ``self.input()``. For every author id, the ``doc`` value of each of
        that author's papers found in the paper frame is collected, and
        ``(author_id, doc)`` rows are written to ``self.output()``, where
        ``doc`` is the collected docs joined with ``'|'``. Authors with no
        matching paper are still written, with an empty doc.
        """
        paper_repdocs_file, author_file = self.input()

        with paper_repdocs_file.open() as pfile:
            paper_df = pd.read_csv(pfile, index_col=(0,))
            # Replace missing values with '' so every doc can be joined below.
            paper_df.fillna('', inplace=True)

        # read out authorship records
        with author_file.open() as afile:
            author_df = pd.read_csv(afile, header=0, index_col=(0,))

        # initialize repdoc dictionary from complete list of person ids
        author_ids = author_df.index.unique()
        repdocs = {i: [] for i in author_ids}

        # Look the column up once instead of once per authorship record, and
        # read docs from the Series so no full row is materialised per lookup.
        paper_docs = paper_df['doc']
        known_paper_ids = paper_docs.index

        # build up repdocs for each author
        for person_id, paper_id in author_df.itertuples():
            # Only papers that have a repdoc row contribute to the author.
            if paper_id in known_paper_ids:
                repdocs[person_id].append(paper_docs.loc[paper_id])

        # save repdocs; items() (unlike iteritems()) exists on Python 2 and 3
        rows = ((person_id, '|'.join(docs))
                for person_id, docs in repdocs.items())
        util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
Ejemplo n.º 2
0
    def run(self):
        """Annotate each author node with its venue ids and save the graph.

        Loads the author graph (gzipped GraphML) and the author-id to node-id
        CSV from ``self.input()``. Every vertex gets a ``venues`` attribute
        holding a tuple of the distinct venue ids linked to that author. The
        graph is then written with ``write_picklez`` and the venue id map is
        written as ``(venue_id, venue_name)`` CSV rows.
        """
        # paper_file and author_file are unpacked but not read in this method.
        graph_file, idmap_file, paper_file, author_file = self.input()

        # Read in dependencies
        lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
        author_venue_df = self.build_linked_venue_frame()
        venue_map = self.assign_venue_ids(author_venue_df)

        # First CSV column is the key, second is the node id parsed as int.
        records = util.iter_csv_fwrapper(idmap_file)
        lcc_idmap = {record[0]: int(record[1]) for record in records}

        # Use sets in order to ensure uniqueness.
        for v in lcc.vs:
            v['venues'] = set()

        # Add the venue IDs to the node venue sets.
        for _, (author_id, venue) in author_venue_df.iterrows():
            # The id map keys come from CSV text, hence the str() conversion.
            node_id = lcc_idmap[str(author_id)]
            venue_id = venue_map[venue]
            lcc.vs[node_id]['venues'].add(venue_id)

        # Convert the sets to tuples.
        for v in lcc.vs:
            v['venues'] = tuple(v['venues'])

        # save a copy of the graph with venue info
        pickle_outfile, venue_map_outfile = self.output()
        lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

        # items() (unlike iteritems()) is available on both Python 2 and 3.
        rows = ((vnum, venue) for venue, vnum in venue_map.items())
        util.write_csv_to_fwrapper(venue_map_outfile,
                                   ('venue_id', 'venue_name'), rows)
Ejemplo n.º 3
0
    def run(self):
        """Build the paper citation graph, save it, and return it.

        Vertices come from ``self.read_paper_vertices()``. Each vertex gets a
        ``venue`` attribute (for papers yielded by ``read_paper_venues``) and
        an ``author_ids`` list; edges come from
        ``self.read_paper_references``. The paper-id to node-id map is written
        as CSV, and the graph is written with both ``write_picklez`` and
        ``write_graphmlz``.
        """
        graph = igraph.Graph()
        graph.add_vertices(self.read_paper_vertices())

        # Map every paper id (the vertex name as a string) to its node index
        # and persist that mapping, sorted by paper id.
        paper_to_node = {}
        for vertex in graph.vs:
            paper_to_node[str(vertex['name'])] = vertex.index
        id_rows = sorted(paper_to_node.items())
        util.write_csv_to_fwrapper(
            self.idmap_output_file, ('paper_id', 'node_id'), id_rows)

        # Venues are stored as a vertex attribute on the matching paper.
        for paper_id, venue in self.read_paper_venues():
            graph.vs[paper_to_node[paper_id]]['venue'] = venue

        # Give each vertex its own empty list before filling in the authors.
        for vertex in graph.vs:
            vertex['author_ids'] = []
        for author_id, paper_id in util.iter_csv_fwrapper(self.author_file):
            graph.vs[paper_to_node[paper_id]]['author_ids'].append(author_id)

        # Citation records become the edges.
        graph.add_edges(self.read_paper_references(paper_to_node))

        # Persist in both pickle and graphml formats.
        graph.write_picklez(self.pickle_output_file.path)
        graph.write_graphmlz(self.graphml_output_file.path)
        return graph
Ejemplo n.º 4
0
    def run(self):
        """Store the venue ids of every author on the graph and save both.

        The author graph is read from gzipped GraphML and the author-id to
        node-id mapping from a CSV, both taken from ``self.input()``. Each
        vertex receives a ``venues`` tuple of distinct venue ids. Outputs are
        the graph (via ``write_picklez``) and a ``(venue_id, venue_name)``
        CSV of the venue map.
        """
        # Only the first two inputs are read directly in this method.
        graph_file, idmap_file, paper_file, author_file = self.input()

        # Read in dependencies
        lcc = igraph.Graph.Read_GraphMLz(graph_file.path)
        author_venue_df = self.build_linked_venue_frame()
        venue_map = self.assign_venue_ids(author_venue_df)

        # Column 0 is used as the key, column 1 as the integer node id.
        records = util.iter_csv_fwrapper(idmap_file)
        lcc_idmap = {record[0]: int(record[1]) for record in records}

        # Use sets in order to ensure uniqueness.
        for v in lcc.vs:
            v['venues'] = set()

        # Add the venue IDs to the node venue sets.
        for _, (author_id, venue) in author_venue_df.iterrows():
            # Keys of lcc_idmap are CSV strings, so look up by str(author_id).
            node_id = lcc_idmap[str(author_id)]
            lcc.vs[node_id]['venues'].add(venue_map[venue])

        # Convert the sets to tuples.
        for v in lcc.vs:
            v['venues'] = tuple(v['venues'])

        # save a copy of the graph with venue info
        pickle_outfile, venue_map_outfile = self.output()
        lcc.write_picklez(pickle_outfile.path)  # lcc-author-citation-graph

        # dict.iteritems() only exists on Python 2; items() works on 2 and 3.
        rows = ((vnum, venue) for venue, vnum in venue_map.items())
        util.write_csv_to_fwrapper(
            venue_map_outfile, ('venue_id', 'venue_name'), rows)
Ejemplo n.º 5
0
    def run(self):
        """Assemble and persist the paper citation graph.

        Steps, in order: add the paper vertices, write the paper-id to
        node-id CSV, attach ``venue`` and ``author_ids`` vertex attributes,
        add the citation edges, then write the graph with ``write_picklez``
        and ``write_graphmlz``. The finished graph is returned.
        """
        citation_graph = igraph.Graph()
        paper_nodes = self.read_paper_vertices()
        citation_graph.add_vertices(paper_nodes)

        # Paper id (stringified vertex name) -> node index, saved sorted.
        node_of = dict(
            (str(node['name']), node.index) for node in citation_graph.vs)
        mapping_rows = list(node_of.items())
        mapping_rows.sort()
        util.write_csv_to_fwrapper(self.idmap_output_file,
                                   ('paper_id', 'node_id'), mapping_rows)

        # Attach the venue of each paper to its node.
        for paper_id, venue in self.read_paper_venues():
            target = node_of[paper_id]
            citation_graph.vs[target]['venue'] = venue

        # Start every node with an empty author list ...
        for node in citation_graph.vs:
            node['author_ids'] = []

        # ... then append each author to the node of the paper they wrote.
        authorships = util.iter_csv_fwrapper(self.author_file)
        for author_id, paper_id in authorships:
            target = node_of[paper_id]
            citation_graph.vs[target]['author_ids'].append(author_id)

        # Edges are built from the citation records.
        links = self.read_paper_references(node_of)
        citation_graph.add_edges(links)

        # Two output formats: pickle and graphml.
        citation_graph.write_picklez(self.pickle_output_file.path)
        citation_graph.write_graphmlz(self.graphml_output_file.path)
        return citation_graph
Ejemplo n.º 6
0
 def run(self):
     """Write every paper repdoc as a ``(paper_id, doc)`` CSV row.

     A paper's repdoc is its title and abstract joined by a space. The
     repdocs come from ``self.read_paper_repdocs()`` and each doc is encoded
     as UTF-8 before being written to ``self.output()``.
     """
     def encoded(pairs):
         # Lazily encode each document, one row at a time.
         for paper_id, text in pairs:
             yield paper_id, text.encode('utf-8')

     repdocs = self.read_paper_repdocs()
     util.write_csv_to_fwrapper(
         self.output(), ('paper_id', 'doc'), encoded(repdocs))
Ejemplo n.º 7
0
 def run(self):
     """Save the paper repdocs as UTF-8 encoded ``(paper_id, doc)`` rows.

     Each repdoc is a paper's title and abstract concatenated with a space
     between them; the pairs are supplied by ``self.read_paper_repdocs()``.
     """
     paper_docs = self.read_paper_repdocs()
     # Generator: docs are encoded only as the writer consumes the rows.
     encoded_rows = (
         (paper_id, text.encode('utf-8')) for paper_id, text in paper_docs
     )
     header = ('paper_id', 'doc')
     util.write_csv_to_fwrapper(self.output(), header, encoded_rows)
Ejemplo n.º 8
0
    def run(self):
        """Build the undirected author graph and save it with its id map.

        Nodes come from ``self.read_author_ids()`` and edges from
        ``self.get_edges()``. The graph is written as gzipped GraphML, and the
        vertex-name to vertex-index map is written as sorted
        ``(author_id, node_id)`` CSV rows.
        """
        author_ids = self.read_author_ids()
        links = self.get_edges()
        graph = util.build_undirected_graph(author_ids, links)

        # Persist the graph first.
        graph_target, idmap_target = self.output()
        graph.write_graphmlz(graph_target.path)

        # Then persist the name -> index mapping, ordered by name.
        name_to_index = {}
        for vertex in graph.vs:
            name_to_index[vertex['name']] = vertex.index
        id_rows = list(name_to_index.items())
        id_rows.sort()
        util.write_csv_to_fwrapper(
            idmap_target, ('author_id', 'node_id'), id_rows)
Ejemplo n.º 9
0
    def run(self):
        """Create the author graph, then write it and its node id map.

        The graph is built by ``util.build_undirected_graph`` from the author
        ids and edges this task provides. It is saved as gzipped GraphML,
        followed by a CSV of ``(author_id, node_id)`` rows sorted by author id.
        """
        vertices = self.read_author_ids()
        connections = self.get_edges()
        author_graph = util.build_undirected_graph(vertices, connections)

        # Gzipped graphml output.
        graphml_target, idmap_target = self.output()
        author_graph.write_graphmlz(graphml_target.path)

        # ID map output: vertex name -> vertex index.
        node_ids = dict((node['name'], node.index) for node in author_graph.vs)
        header = ('author_id', 'node_id')
        util.write_csv_to_fwrapper(
            idmap_target, header, sorted(node_ids.items()))
Ejemplo n.º 10
0
    def run(self):
        """Extract the largest connected component of the author graph.

        The author graph is read from gzipped GraphML. Its giant component is
        written as gzipped GraphML and as an edge list, and the component's
        vertex-name to vertex-index map is written as sorted
        ``(author_id, node_id)`` CSV rows.
        """
        graphml_target, edgelist_target, idmap_target = self.output()
        full_graph_file, _ = self.input()

        # Load the full graph and keep only its giant component.
        full_graph = igraph.Graph.Read_GraphMLz(full_graph_file.path)
        giant = full_graph.components().giant()
        giant.write_graphmlz(graphml_target.path)
        giant.write_edgelist(edgelist_target.path)

        # Node indices are those of the component, not of the full graph.
        node_ids = {}
        for vertex in giant.vs:
            node_ids[vertex['name']] = vertex.index
        util.write_csv_to_fwrapper(
            idmap_target, ('author_id', 'node_id'), sorted(node_ids.items()))
Ejemplo n.º 11
0
    def run(self):
        """Save the giant component of the author graph plus its id map.

        Outputs, in order: the component as gzipped GraphML, the component as
        an edge list, and a CSV of ``(author_id, node_id)`` rows sorted by
        author id.
        """
        graphml_out, edgelist_out, idmap_out = self.output()
        source_graph_file, _ = self.input()

        # Read the graph and find its LCC.
        source_graph = igraph.Graph.Read_GraphMLz(source_graph_file.path)
        clusters = source_graph.components()
        largest = clusters.giant()

        # Save the LCC in both formats.
        largest.write_graphmlz(graphml_out.path)
        largest.write_edgelist(edgelist_out.path)

        # Vertex name -> index within the LCC.
        index_by_name = dict((node['name'], node.index) for node in largest.vs)
        id_rows = list(index_by_name.items())
        id_rows.sort()
        util.write_csv_to_fwrapper(idmap_out, ('author_id', 'node_id'),
                                   id_rows)
Ejemplo n.º 12
0
    def run(self):
        """Write one '|'-joined representative document per author.

        The paper repdoc CSV and the authorship CSV are read from
        ``self.input()``. Each author's docs are the ``doc`` values of their
        papers; authorship records that reference a paper with no row in the
        paper frame are skipped instead of raising ``KeyError``. Every author
        id in the authorship file is written to ``self.output()`` as an
        ``(author_id, doc)`` row, with an empty doc if nothing matched.
        """
        paper_repdocs_file, author_file = self.input()

        with paper_repdocs_file.open() as pfile:
            paper_df = pd.read_csv(pfile, index_col=(0,))
            # NaN cannot be joined as a string, so blank out missing values.
            paper_df.fillna('', inplace=True)

        # read out authorship records
        with author_file.open() as afile:
            author_df = pd.read_csv(afile, header=0, index_col=(0,))

        # initialize repdoc dictionary from complete list of person ids
        author_ids = author_df.index.unique()
        repdocs = {i: [] for i in author_ids}

        # Fetch the doc column once; membership is tested against its index.
        paper_docs = paper_df['doc']
        known_paper_ids = paper_docs.index

        # build up repdocs for each author
        for person_id, paper_id in author_df.itertuples():
            if paper_id not in known_paper_ids:
                # No repdoc exists for this paper; nothing to add.
                continue
            repdocs[person_id].append(paper_docs.loc[paper_id])

        # save repdocs; items() (unlike iteritems()) exists on Python 2 and 3
        rows = ((person_id, '|'.join(docs))
                for person_id, docs in repdocs.items())
        util.write_csv_to_fwrapper(self.output(), ('author_id', 'doc'), rows)
Ejemplo n.º 13
0
 def run(self):
     """Write the authorship records as ``(author_id, paper_id)`` CSV rows."""
     records = self.iter_authorships()
     target = self.output()
     header = ('author_id', 'paper_id')
     util.write_csv_to_fwrapper(target, header, records)
Ejemplo n.º 14
0
 def run(self):
     """Save every authorship from ``self.iter_authorships()`` to the output.

     The CSV header is ``author_id``, ``paper_id``.
     """
     author_paper_pairs = self.iter_authorships()
     column_names = ('author_id', 'paper_id')
     util.write_csv_to_fwrapper(
         self.output(), column_names, author_paper_pairs)
Ejemplo n.º 15
0
 def run(self):
     """Write the author ``(id, name)`` pairs to the task output as CSV."""
     id_name_pairs = self.read_author_id_name_pairs()
     header = ('id', 'name')
     util.write_csv_to_fwrapper(self.output(), header, id_name_pairs)
Ejemplo n.º 16
0
 def run(self):
     """Vectorize each repdoc and write it as a ``(paper_id, doc)`` row.

     Rows are read from the input CSV. Each doc is decoded from UTF-8 and
     passed through ``doctovec.vectorize``; the resulting items are joined
     with ``'|'`` and re-encoded as UTF-8 before being written.
     """
     def vectorized(records):
         # One row at a time: decode -> vectorize -> join -> encode.
         for paper_id, raw in records:
             terms = doctovec.vectorize(raw.decode('utf-8'))
             yield paper_id, '|'.join(terms).encode('utf-8')

     source_rows = util.iter_csv_fwrapper(self.input())
     util.write_csv_to_fwrapper(
         self.output(), ('paper_id', 'doc'), vectorized(source_rows))
Ejemplo n.º 17
0
 def run(self):
     """Convert the input repdocs to '|'-joined vectorized docs and save them.

     Every input doc is UTF-8 decoded and run through ``doctovec.vectorize``.
     Its output is joined with ``'|'``, encoded back to UTF-8 and written
     under the header ``paper_id``, ``doc``.
     """
     input_rows = util.iter_csv_fwrapper(self.input())
     # A single lazy pipeline; each row is transformed as the writer pulls it.
     output_rows = (
         (paper_id,
          '|'.join(doctovec.vectorize(raw.decode('utf-8'))).encode('utf-8'))
         for paper_id, raw in input_rows
     )
     header = ('paper_id', 'doc')
     util.write_csv_to_fwrapper(self.output(), header, output_rows)
Ejemplo n.º 18
0
 def run(self):
     """Save the pairs from ``self.read_author_id_name_pairs()`` as CSV.

     The header written is ``id``, ``name``.
     """
     pairs = self.read_author_id_name_pairs()
     destination = self.output()
     util.write_csv_to_fwrapper(destination, ('id', 'name'), pairs)