def make_citation_graphs(self, **kwargs):
    """
    For each time period, writes a file representing a graph in which each
    line represents an author citing another author. The edges are
    weighted, inversely proportional to the number of authors in the
    cited work.

    Parameters
    ----------
    resolution: str
        If 'year', the time period is year, creating a graph file per
        year. If 'month', the time period is month, creating a folder for
        each year with one file per month. Default: 'year'.
    from_year: int
        Considers only periods starting at this year (inclusive).
        Default: 0.
    until_year: int
        Considers only periods before this year (exclusive).
        Default: inf.

    Returns
    -------
    list: Paths of all graph files created (also dumped as files.json).
    """
    resolution = kwargs.get("resolution", "year")
    from_year = kwargs.get("from_year", 0)
    until_year = kwargs.get("until_year", float("inf"))

    grouped_works = self.group_by_time(self.works, resolution=resolution)
    citation_graphs_dir = "%s/citation_graphs" % self.output_dir_path
    set_dir(citation_graphs_dir)
    created_files = []

    # For each time mark T
    for (ref_date, works_list) in grouped_works:
        # from_year inclusive, until_year exclusive
        if from_year <= ref_date.year < until_year:
            LOGGER.info("Building citation graph %s", ref_date)
            graph_file_name = self.get_graph_file_name(
                citation_graphs_dir, ref_date, resolution, "citations")
            # Initializes the file with the csv header only
            self._make_graph({}, graph_file_name,
                             header=["author_i", "author_j", "weight"])
            # For each work i in time T, append its citation edges
            for work_id in works_list:
                self.citations_graph(work_id, graph_file_name)
            created_files.append(graph_file_name)
            # presumably merges duplicate edges into summed weights — see sum_edges
            self.sum_edges(graph_file_name)
            LOGGER.info("Graph stored at %s", graph_file_name)

    # Index of every file produced in this run
    with open("%s/files.json" % citation_graphs_dir, "wb") as files:
        files.write(json.dumps(created_files))
    return created_files
def load_citations(self):
    """
    Loads the citation relation from the csv file at
    `self.citation_csv_path`, where each data line ("source,target")
    represents one work citing another. Pairs whose ids are both known
    are appended to the citing work's cited-works list; pairs with an
    unknown id are counted per source id and exported as a json report.
    """
    not_listed = {}
    total_lines = 0
    LOGGER.info("Loading citations!")
    start_time = time.time()
    with open(self.citation_csv_path) as csv_file:
        for total_lines, raw_line in enumerate(csv_file, 1):
            if total_lines % 50000 == 0:
                LOGGER.debug("Line # %d", total_lines)
            # The first line is a comment/header — skip it
            if total_lines == 1:
                continue
            source_id, target_id = raw_line.rstrip("\n").split(",")
            # Ensuring source and target are known works
            if source_id in self.works_map and target_id in self.works_map:
                source_idx = self.works_map[source_id]
                target_idx = self.works_map[target_id]
                self.works[source_idx][WORK_INFO][CITED_WORKS].append(target_idx)
            else:
                # NOTE(review): counter is keyed by the source id even when
                # the target is the unknown one — confirm this is intended
                not_listed[source_id] = not_listed.get(source_id, 0) + 1
    dump(not_listed, "%s/%s.json" % (self.output_dir_path, "non_listed"))
    LOGGER.info("Non-listed works: %d", len(not_listed.keys()))
    LOGGER.info("%d citations loaded after %f seconds",
                total_lines, time.time() - start_time)
def sort_elements(self):
    """
    Sorts the works list by publication date and, for each work, sorts
    its list of author ids. Also fills `self.works_map`, mapping each
    work id to the work's index in `self.works`; the id is popped from
    the work info in the process.
    """
    LOGGER.info("Sorting elements...")
    before = time.time()
    # Sorting works by date (entries are (date, info) pairs)
    self.works.sort()
    # Mapping work_id and their respective index in list
    for work_idx in xrange(len(self.works)):
        work_id = self.works[work_idx][WORK_INFO].pop(WORK_ID)
        self.works_map[work_id] = work_idx
        # BUG FIX: was self.works[work_id] — indexing the list with the
        # popped (string) work id instead of the integer position
        self.works[work_idx][WORK_INFO][AUTHORS_LIST].sort()
    LOGGER.info("Elements sorted after %f seconds", time.time() - before)
def get_graph_file_name(output_dir, ref_date, resolution, g_type):
    """
    Builds the csv file name for a graph, creating its directory if
    needed.

    Parameters
    ----------
    output_dir: str
        Base directory where graph files are stored.
    ref_date: date-like
        Reference date of the time period (needs .year, and .month for
        monthly resolution).
    resolution: str
        'year' for one file per year, or 'month' for one folder per year
        containing one file per month.
    g_type: str
        Graph type tag embedded in the file name (e.g. 'citations').

    Returns
    -------
    str: Full path of the graph csv file.

    Raises
    ------
    ValueError: If `resolution` is neither 'year' nor 'month'.
    """
    if resolution == "month":
        # Creating one folder per year
        graph_dir = set_dir("%s/%d" % (output_dir, ref_date.year))
        graph_file_name = "%s/aps_%s_%d_%d.csv" % (
            graph_dir, g_type, ref_date.year, ref_date.month)
    elif resolution == "year":
        graph_dir = set_dir(output_dir)
        graph_file_name = "%s/aps_%s_%d.csv" % (
            graph_dir, g_type, ref_date.year)
    else:
        # Previously an unknown resolution crashed later with
        # UnboundLocalError; fail with a meaningful error instead.
        raise ValueError("Unknown resolution: %s" % resolution)
    LOGGER.info(graph_file_name)
    return graph_file_name
def _get_work_info(self, file_path): """ Returns publication_id and a dictionary with authors list, publication date and an empt list for cited_works. This method also updates the authors dict, which holds each author identifier. """ file_data = json.load(open(file_path)) authors_list = [] # Listing authors of publications, handling possible Editorials try: work_authors = file_data["authors"] for author in work_authors: # Handling error if author does not have a name in json try: # Assigns an id for the author if isinstance(author, dict) and author["name"] not in self.authors_map: self.authors_map[author["name"]] = self.authors_count self.authors_count += 1 author_idx = self.authors_map[author["name"]] # Inserts author id in list of authors authors_list.append(author_idx) except KeyError: LOGGER.error("Work: %s, Bug: unnamed author %s", file_path, str(author)) except TypeError, exc: LOGGER.error("Work: %s, Bug: %s", file_path, exc) return None, None except KeyError: LOGGER.error("Work: %s, Alert: 0 authors!", file_path) return None, None return (file_data["date"], [authors_list, [], file_data["id"]])
def find_works(self):
    """
    Walks the metadata folder (publisher dirs -> edition dirs -> work
    json files), loading each work through `_get_work_info` and storing
    the (date, info) pairs in `self.works`. Exports an overview of the
    counts, sorts the loaded elements, and releases the authors map.
    """
    overview = {"Publishers": 0, "Works": 0, "Retrieved": 0}
    LOGGER.info("Searching for works...")
    start_time = time.time()
    # List of publishers
    for publisher_name in os.listdir(self.works_dir_path):
        publisher_path = "{}/{}".format(self.works_dir_path, publisher_name)
        overview["Publishers"] += 1
        LOGGER.debug("dir # %d: %s", overview["Publishers"], publisher_name)
        # List of editions
        for edition_name in os.listdir(publisher_path):
            edition_path = "{}/{}".format(publisher_path, edition_name)
            # List of works in this edition
            for work_file_name in os.listdir(edition_path):
                overview["Works"] += 1
                work_file_path = "{}/{}".format(edition_path, work_file_name)
                # Gets publication_date and author list for each work
                work_date, work_info = self._get_work_info(work_file_path)
                # A None date flags an unusable record — skip it
                if work_date:
                    overview["Retrieved"] += 1
                    self.works.append((work_date, work_info))
    # Exporting overview info
    dump(overview, "%s/%s.txt" % (self.output_dir_path, "aps_works_overview"))
    LOGGER.info("Loaded %d works after %f seconds",
                overview["Retrieved"], time.time() - start_time)
    self.sort_elements()
    # Author ids are all assigned; drop the map to free memory
    self.authors_map = None