예제 #1
0
    def make_citation_graphs(self, **kwargs):
        """
        For each time period, writes a file representing a graph in which
        each line is an edge: an author citing another author. Edges carry
        a weight inversely proportional to the number of authors in the
        cited work.

        Parameters
        ----------
        resolution: str
            If 'year', the time period is year, creating a graph file per year.
            If 'month', the time period is month, creating a folder for each
            year with files for each month with data.
            Default: 'year'.
        from_year: int
            Only considers works published from this year onward (inclusive).
            Default: 0.
        until_year: int
            Only considers works published before this year (exclusive).
            Default: inf.

        Returns
        -------
        list:
            Paths of all created graph files (also dumped to files.json).
        """
        resolution = kwargs.get("resolution", "year")
        from_year = kwargs.get("from_year", 0)
        until_year = kwargs.get("until_year", float("inf"))
        grouped_works = self.group_by_time(self.works, resolution=resolution)
        citation_graphs_dir = "%s/citation_graphs" % self.output_dir_path
        set_dir(citation_graphs_dir)
        created_files = []
        # For each time mark T
        for (ref_date, works_list) in grouped_works:
            # NOTE: lower bound inclusive, upper bound exclusive
            if from_year <= ref_date.year < until_year:
                LOGGER.info("Building citation graph %s", ref_date)
                graph_file_name = self.get_graph_file_name(citation_graphs_dir,
                                                           ref_date,
                                                           resolution,
                                                           "citations")
                # Creates the file containing only the header line
                self._make_graph({},
                                 graph_file_name,
                                 header=["author_i", "author_j", "weight"])
                # For each work i in time T, append its citation edges
                for work_id in works_list:
                    self.citations_graph(work_id, graph_file_name)
                created_files.append(graph_file_name)
                # presumably consolidates duplicated edges — see sum_edges
                self.sum_edges(graph_file_name)
                LOGGER.info("Graph stored at %s", graph_file_name)
        # BUG FIX: json.dumps returns str, so the index file must be opened in
        # text mode ("wb" would require bytes and fails on Python 3).
        with open("%s/files.json" % citation_graphs_dir, "w") as files:
            files.write(json.dumps(created_files))
        return created_files
예제 #2
0
 def load_citations(self):
     """
     Loads the relation of cited works from the csv file, in which each
     line represents one work citing another.
     """
     line_counter = 0
     not_listed = {}
     LOGGER.info("Loading citations!")
     before = time.time()
     with open(self.citation_csv_path) as csv_file:
         for line_number, raw_line in enumerate(csv_file, start=1):
             line_counter = line_number
             # Progress report every 50k lines
             if line_number % 50000 == 0:
                 LOGGER.debug("Line # %d", line_number)
             # First line is a comment/header: skip it
             if line_number == 1:
                 continue
             source_id, target_id = raw_line.rstrip("\n").split(",")
             if source_id in self.works_map and target_id in self.works_map:
                 # Both endpoints are known works: register the citation
                 source = self.works_map[source_id]
                 target = self.works_map[target_id]
                 self.works[source][WORK_INFO][CITED_WORKS].append(target)
             else:
                 # Tally citations whose endpoints are unknown, by source id
                 not_listed[source_id] = not_listed.get(source_id, 0) + 1
     dump(not_listed, "%s/%s.json" % (self.output_dir_path, "non_listed"))
     LOGGER.info("Non-listed works: %d", len(not_listed))
     LOGGER.info("%d citations loaded after %f seconds", line_counter, time.time() - before)
예제 #3
0
 def sort_elements(self):
     """
     Sorts the works list by publication date and, for each work, sorts
     its list of author ids. Also fills `self.works_map`, mapping each
     work id to the work's index in the sorted list (the id is popped
     from the work info in the process).
     """
     LOGGER.info("Sorting elements...")
     before = time.time()
     # Sorting works by date (entries compare by their first element)
     self.works.sort()
     # Mapping work_id to its index in the sorted list
     for work_idx, work in enumerate(self.works):
         work_id = work[WORK_INFO].pop(WORK_ID)
         self.works_map[work_id] = work_idx
         # BUG FIX: the original indexed self.works with the (string)
         # work_id instead of the integer work_idx
         work[WORK_INFO][AUTHORS_LIST].sort()
     LOGGER.info("Elements sorted after %f seconds", time.time() - before)
예제 #4
0
 def get_graph_file_name(output_dir, ref_date, resolution, g_type):
     """
     Builds (and ensures the directory for) the csv file path for a graph.

     Parameters
     ----------
     output_dir: str
         Base directory where graph files are stored.
     ref_date:
         Date-like object with `year` (and, for 'month' resolution,
         `month`) attributes.
     resolution: str
         Either 'year' or 'month'.
     g_type: str
         Graph type tag embedded in the file name (e.g. 'citations').

     Returns
     -------
     str:
         Path of the graph csv file.

     Raises
     ------
     ValueError:
         If `resolution` is neither 'year' nor 'month' (the original code
         crashed with UnboundLocalError in that case).
     """
     if resolution == "month":
         # Creating one folder per year, one file per month
         graph_dir = set_dir("%s/%d" % (output_dir, ref_date.year))
         graph_file_name = "%s/aps_%s_%d_%d.csv" % (graph_dir,
                                                    g_type,
                                                    ref_date.year,
                                                    ref_date.month)
     elif resolution == "year":
         graph_dir = set_dir(output_dir)
         graph_file_name = "%s/aps_%s_%d.csv" % (graph_dir,
                                                 g_type,
                                                 ref_date.year)
         LOGGER.info(graph_file_name)
     else:
         raise ValueError("Unknown resolution: %s" % resolution)
     return graph_file_name
예제 #5
0
 def _get_work_info(self, file_path):
     """
     Reads a work's json metadata file and returns a tuple
     (publication_date, [authors_list, cited_works, work_id]), where
     cited_works starts as an empty list. Also updates `self.authors_map`
     and `self.authors_count`, which assign an integer id to each author
     name. Returns (None, None) when the file has no usable author data.
     """
     # BUG FIX: use a context manager so the file handle is closed
     with open(file_path) as json_file:
         file_data = json.load(json_file)
     authors_list = []
     # Listing authors of publications, handling possible Editorials
     try:
         work_authors = file_data["authors"]
         for author in work_authors:
             # Handling error if author does not have a name in json
             try:
                 # Assigns an id for the author on first sight
                 if isinstance(author, dict) and author["name"] not in self.authors_map:
                     self.authors_map[author["name"]] = self.authors_count
                     self.authors_count += 1
                 author_idx = self.authors_map[author["name"]]
                 # Inserts author id in list of authors
                 authors_list.append(author_idx)
             except KeyError:
                 LOGGER.error("Work: %s, Bug: unnamed author %s", file_path, str(author))
             # Modernized from legacy `except TypeError, exc` syntax;
             # triggers e.g. when `author` is not a dict
             except TypeError as exc:
                 LOGGER.error("Work: %s, Bug: %s", file_path, exc)
                 return None, None
     except KeyError:
         LOGGER.error("Work: %s, Alert: 0 authors!", file_path)
         return None, None
     return (file_data["date"], [authors_list, [], file_data["id"]])
예제 #6
0
 def find_works(self):
     """
     Walks the metadata directory tree (publisher / edition / work json
     files), loading every work's information into `self.works`, then
     sorts the loaded data.
     """
     stats = {"Publishers": 0,
              "Works": 0,
              "Retrieved": 0}
     LOGGER.info("Searching for works...")
     start_time = time.time()
     # First level: one directory per publisher
     for publisher_name in os.listdir(self.works_dir_path):
         publisher_path = "%s/%s" % (self.works_dir_path, publisher_name)
         stats["Publishers"] += 1
         LOGGER.debug("dir # %d: %s", stats["Publishers"], publisher_name)
         # Second level: one directory per edition
         for edition_name in os.listdir(publisher_path):
             edition_path = "%s/%s" % (publisher_path, edition_name)
             # Third level: one json file per work
             for work_file_name in os.listdir(edition_path):
                 stats["Works"] += 1
                 work_file_path = "%s/%s" % (edition_path, work_file_name)
                 # Gets publication date and author list for this work
                 work_date, work_info = self._get_work_info(work_file_path)
                 # A None (or otherwise falsy) date means the file was unusable
                 if work_date:
                     stats["Retrieved"] += 1
                     self.works.append((work_date, work_info))
     # Exporting overview info
     dump(stats, "%s/%s.txt" % (self.output_dir_path,
                                "aps_works_overview"))
     LOGGER.info("Loaded %d works after %f seconds", stats["Retrieved"],
                                                     time.time() - start_time)
     self.sort_elements()
     # Author-name map is no longer needed after loading
     self.authors_map = None