Esempio n. 1
0
 def parse_paperauthors(self):
     """Read the processed PaperAuthor file and register every valid row.

     The file path comes from ``data_io.get_paths()`` under the key
     ``"paperauthor_processed_path"``. The first line is treated as a
     header and skipped. Every following line is split on commas; rows
     whose first field is not made of digits are ignored. For each kept
     row the paper and author are looked up with ``self.papers.get`` and
     ``self.authors.get`` and handed to ``self.update_paperauthor``
     together with the cleaned author name and affiliation.

     Progress is printed every 100000 lines (skipped lines included).
     """
     print("Parsing PaperAuthors...")
     path = data_io.get_paths()["paperauthor_processed_path"]
     # The context manager guarantees the handle is closed even when a
     # malformed row raises part-way through the file.
     with open(path, "r") as f:
         f.readline()  # Discard the header line.
         for count, l in enumerate(f, 1):
             if count % 100000 == 0:
                 print(count)
             res = l.strip().split(",")
             if not res[0].isdigit():
                 continue
             paper_id = int(res[0])
             author_id = int(res[1])
             # Decode the raw bytes as UTF-8, then transliterate to ASCII.
             raw_author_name = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             # Only the first element of the filter_title result is used.
             author_name = nlp.filter_title(raw_author_name)[0]
             raw_author_affiliation = unidecode.unidecode(
                 unicode(res[3], encoding="utf-8"))
             author_affiliation = nlp.filter_affiliation(
                 raw_author_affiliation)
             # NOTE(review): presumably .get yields None for unknown ids and
             # update_paperauthor copes with that -- confirm against it.
             curr_paper = self.papers.get(paper_id)
             curr_author = self.authors.get(author_id)
             self.update_paperauthor(curr_paper, curr_author, author_id,
                                     author_name, author_affiliation)
     print("Done")
    def parse_authors(self):
        """Read the processed Author file and populate the author tables.

        The file path comes from ``data_io.get_paths()`` under the key
        ``"author_processed_path"``. The first line is treated as a header
        and skipped. For every following comma-separated row this method:

        * cleans field 1 with ``nlp.filter_title`` into ``(name, surname)``
          and increments ``self.surnames[surname]``;
        * cleans field 2 with ``nlp.filter_affiliation`` and increments
          ``self.affiliations[affiliation]``;
        * stores an ``author.Author`` in ``self.authors`` keyed by the
          integer in field 0.
        """
        print("Parsing Authors...")
        path = data_io.get_paths()["author_processed_path"]
        # The context manager guarantees the handle is closed even when a
        # malformed row raises part-way through the file.
        with open(path, "r") as f:
            f.readline()  # Discard the header line.
            # Iterate lazily instead of loading every line with readlines().
            for l in f:
                res = l.strip().split(",")

                # Titles: decode as UTF-8, then transliterate to ASCII.
                raw_title = unidecode.unidecode(
                    unicode(res[1], encoding="utf-8"))
                (name, surname) = nlp.filter_title(raw_title)
                # Only a missing key means "first occurrence"; any other
                # error must surface rather than be silently swallowed.
                try:
                    self.surnames[surname] = self.surnames[surname] + 1
                except KeyError:
                    self.surnames[surname] = 1

                # Affiliations
                raw_affiliation = unidecode.unidecode(
                    unicode(res[2], encoding="utf-8"))
                affiliation = nlp.filter_affiliation(raw_affiliation)
                try:
                    self.affiliations[affiliation] = (
                        self.affiliations[affiliation] + 1)
                except KeyError:
                    self.affiliations[affiliation] = 1

                author_id = int(res[0])
                self.authors[author_id] = author.Author(
                    author_id, name, surname, affiliation)

        print("Done")
Esempio n. 3
0
    def parse_authors(self):
        """Load authors from the processed Author file.

        Opens the path stored under ``"author_processed_path"`` in
        ``data_io.get_paths()``, skips the header line, and for each
        remaining comma-separated row:

        * derives ``(name, surname)`` from field 1 via ``nlp.filter_title``
          and bumps the per-surname counter in ``self.surnames``;
        * derives the affiliation from field 2 via
          ``nlp.filter_affiliation`` and bumps the counter in
          ``self.affiliations``;
        * registers ``author.Author(id, name, surname, affiliation)`` in
          ``self.authors`` under the integer id taken from field 0.
        """
        print("Parsing Authors...")
        path = data_io.get_paths()["author_processed_path"]
        # Use a context manager so the file is closed on every exit path,
        # including an exception raised while parsing a row.
        with open(path, "r") as f:
            f.readline()  # Header row is not needed.
            # Stream the file line by line rather than calling readlines().
            for l in f:
                res = l.strip().split(",")

                # Titles: decode as UTF-8, then transliterate to ASCII.
                raw_title = unidecode.unidecode(
                    unicode(res[1], encoding="utf-8"))
                (name, surname) = nlp.filter_title(raw_title)
                # Catch only the "key not seen yet" case; a bare except
                # would also hide unrelated failures.
                try:
                    self.surnames[surname] = self.surnames[surname] + 1
                except KeyError:
                    self.surnames[surname] = 1

                # Affiliations
                raw_affiliation = unidecode.unidecode(
                    unicode(res[2], encoding="utf-8"))
                affiliation = nlp.filter_affiliation(raw_affiliation)
                try:
                    self.affiliations[affiliation] = (
                        self.affiliations[affiliation] + 1)
                except KeyError:
                    self.affiliations[affiliation] = 1

                author_id = int(res[0])
                self.authors[author_id] = author.Author(
                    author_id, name, surname, affiliation)

        print("Done")
 def parse_paperauthors(self):
     """Load paper/author links from the processed PaperAuthor file.

     Opens the path stored under ``"paperauthor_processed_path"`` in
     ``data_io.get_paths()`` and skips the header line. Each remaining
     line is split on commas; a row is ignored unless its first field
     consists only of digits. For every kept row, the paper (field 0) and
     author (field 1) are fetched with ``self.papers.get`` and
     ``self.authors.get`` and passed to ``self.update_paperauthor`` along
     with the cleaned author name (field 2) and affiliation (field 3).

     A running line count is printed every 100000 lines; ignored rows
     count towards it.
     """
     print("Parsing PaperAuthors...")
     path = data_io.get_paths()["paperauthor_processed_path"]
     # Use a context manager so the file is closed on every exit path,
     # including an exception raised while parsing a row.
     with open(path, "r") as f:
         f.readline()  # Header row is not needed.
         for count, l in enumerate(f, 1):
             if count % 100000 == 0:
                 print(count)
             res = l.strip().split(",")
             if not res[0].isdigit():
                 continue
             paper_id = int(res[0])
             author_id = int(res[1])
             # Decode the raw bytes as UTF-8, then transliterate to ASCII.
             raw_author_name = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             # filter_title's result is indexed; only element 0 is kept.
             author_name = nlp.filter_title(raw_author_name)[0]
             raw_author_affiliation = unidecode.unidecode(
                 unicode(res[3], encoding="utf-8"))
             author_affiliation = nlp.filter_affiliation(
                 raw_author_affiliation)
             # NOTE(review): presumably .get yields None for unknown ids and
             # update_paperauthor copes with that -- confirm against it.
             curr_paper = self.papers.get(paper_id)
             curr_author = self.authors.get(author_id)
             self.update_paperauthor(curr_paper, curr_author, author_id,
                                     author_name, author_affiliation)
     print("Done")