def parse_paperauthors(self):
    """Update all journal/conference/coauthor information.

    Reads the file at ``data_io.get_paths()["paperauthor_processed_path"]``,
    skips its first (header) line, and for every remaining row whose first
    field is all digits calls ``self.update_paperauthor`` with the paper and
    author looked up in ``self.papers`` / ``self.authors`` plus the cleaned
    author name and affiliation taken from the row.

    Progress (the running line count) is printed every 100000 lines.

    NOTE(review): another definition of ``parse_paperauthors`` appears later
    in this file; if both belong to the same class, the later one wins.
    """
    print("Parsing PaperAuthors...")
    path = data_io.get_paths()["paperauthor_processed_path"]
    # ``with`` closes the file even when a malformed row raises part-way
    # through, which the previous explicit ``f.close()`` did not.
    with open(path, "r") as f:
        f.readline()  # discard the header row
        # Every line is counted, including the ones skipped below.
        for count, line in enumerate(f, 1):
            if count % 100000 == 0:
                print(count)
            # Plain comma split; assumes fields contain no embedded commas
            # -- TODO confirm against the preprocessing step.
            res = line.strip().split(",")
            if not res[0].isdigit():
                # Rows without a numeric paper id (e.g. blank lines) are ignored.
                continue
            paper_id = int(res[0])
            author_id = int(res[1])
            # Transliterate to ASCII before handing the text to the nlp filters.
            raw_author_name = unidecode.unidecode(
                unicode(res[2], encoding="utf-8"))
            # Only the first element returned by filter_title is used here.
            author_name = nlp.filter_title(raw_author_name)[0]
            raw_author_affiliation = unidecode.unidecode(
                unicode(res[3], encoding="utf-8"))
            author_affiliation = nlp.filter_affiliation(raw_author_affiliation)
            # ``.get`` is used, so an unknown id is passed on as whatever the
            # container returns for a missing key (None for a plain dict).
            curr_paper = self.papers.get(paper_id)
            curr_author = self.authors.get(author_id)
            self.update_paperauthor(curr_paper, curr_author, author_id,
                                    author_name, author_affiliation)
    print("Done")
def parse_authors(self):
    """Create authors from the processed author file.

    Reads the file at ``data_io.get_paths()["author_processed_path"]``,
    skips its first (header) line, and for each remaining row:

    * cleans the name field (``res[1]``) into ``(name, surname)`` via
      ``nlp.filter_title`` and increments ``self.surnames[surname]``;
    * cleans the affiliation field (``res[2]``) via
      ``nlp.filter_affiliation`` and increments
      ``self.affiliations[affiliation]``;
    * stores ``author.Author(id, name, surname, affiliation)`` in
      ``self.authors`` keyed by the integer id from ``res[0]``.

    NOTE(review): another definition of ``parse_authors`` appears later in
    this file; if both belong to the same class, the later one wins.
    """
    print("Parsing Authors...")
    path = data_io.get_paths()["author_processed_path"]
    # ``with`` closes the file even if a row fails to parse.
    with open(path, "r") as f:
        f.readline()  # discard the header row
        # Iterate lazily instead of materialising the file with readlines().
        for line in f:
            # Plain comma split; assumes fields contain no embedded commas
            # -- TODO confirm against the preprocessing step.
            res = line.strip().split(",")

            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            # Count occurrences without a bare ``except`` that would hide
            # unrelated errors (assumes self.surnames is dict-like).
            self.surnames[surname] = self.surnames.get(surname, 0) + 1

            # Affiliations
            raw_affiliation = unidecode.unidecode(
                unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            self.affiliations[affiliation] = (
                self.affiliations.get(affiliation, 0) + 1)

            author_id = int(res[0])
            self.authors[author_id] = author.Author(
                author_id, name, surname, affiliation)
    print("Done")
def parse_authors(self):
    """Create authors from the processed author file.

    The file path comes from ``data_io.get_paths()["author_processed_path"]``.
    The first line is treated as a header and skipped. Each following row is
    split on commas into ``id, name, affiliation``; the name and affiliation
    are transliterated with ``unidecode`` and cleaned with
    ``nlp.filter_title`` / ``nlp.filter_affiliation``. The per-surname and
    per-affiliation counters in ``self.surnames`` and ``self.affiliations``
    are incremented, and an ``author.Author`` is stored in ``self.authors``
    under the integer id.

    NOTE(review): an earlier definition of ``parse_authors`` also appears in
    this file; if both belong to the same class, this later one wins.
    """
    print("Parsing Authors...")
    path = data_io.get_paths()["author_processed_path"]
    # The context manager guarantees the handle is released on any exit path.
    with open(path, "r") as f:
        f.readline()  # discard the header row
        # Stream the rows rather than loading them all with readlines().
        for line in f:
            # Plain comma split; assumes fields contain no embedded commas
            # -- TODO confirm against the preprocessing step.
            res = line.strip().split(",")

            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            # ``.get`` with a default replaces the former bare ``except``,
            # which swallowed every error, not just the missing-key case
            # (assumes self.surnames is dict-like).
            self.surnames[surname] = self.surnames.get(surname, 0) + 1

            # Affiliations
            raw_affiliation = unidecode.unidecode(
                unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            self.affiliations[affiliation] = (
                self.affiliations.get(affiliation, 0) + 1)

            author_id = int(res[0])
            self.authors[author_id] = author.Author(
                author_id, name, surname, affiliation)
    print("Done")
def parse_paperauthors(self):
    """Update all journal/conference/coauthor information.

    The file path comes from
    ``data_io.get_paths()["paperauthor_processed_path"]``. The first line is
    treated as a header and skipped. Each following row is split on commas
    into ``paper_id, author_id, author_name, author_affiliation``; rows whose
    first field is not all digits are ignored. For the rest, the name and
    affiliation are transliterated with ``unidecode`` and cleaned with the
    ``nlp`` filters, and ``self.update_paperauthor`` is called with the
    paper/author objects looked up in ``self.papers`` / ``self.authors``.

    The running line count is printed every 100000 lines as a progress
    indicator.

    NOTE(review): an earlier definition of ``parse_paperauthors`` also
    appears in this file; if both belong to the same class, this later one
    wins.
    """
    print("Parsing PaperAuthors...")
    path = data_io.get_paths()["paperauthor_processed_path"]
    # The context manager guarantees the handle is released even when a row
    # raises; the previous trailing ``f.close()`` was skipped in that case.
    with open(path, "r") as f:
        f.readline()  # discard the header row
        # Skipped rows still advance the counter.
        for count, line in enumerate(f, 1):
            if count % 100000 == 0:
                print(count)
            # Plain comma split; assumes fields contain no embedded commas
            # -- TODO confirm against the preprocessing step.
            res = line.strip().split(",")
            if not res[0].isdigit():
                continue
            paper_id = int(res[0])
            author_id = int(res[1])
            raw_author_name = unidecode.unidecode(
                unicode(res[2], encoding="utf-8"))
            # filter_title's first element is used as the author name.
            author_name = nlp.filter_title(raw_author_name)[0]
            raw_author_affiliation = unidecode.unidecode(
                unicode(res[3], encoding="utf-8"))
            author_affiliation = nlp.filter_affiliation(raw_author_affiliation)
            # Lookups use ``.get``: ids missing from the containers are
            # forwarded as the container's default (None for a plain dict).
            curr_paper = self.papers.get(paper_id)
            curr_author = self.authors.get(author_id)
            self.update_paperauthor(curr_paper, curr_author, author_id,
                                    author_name, author_affiliation)
    print("Done")