Beispiel #1
0
    def parse_papers(self):
        """Load the processed papers file into ``self.papers``.

        Reads the file at ``data_io.get_paths()["paper_processed_path"]``,
        discards its first line, and for every remaining line:

        * splits the line on commas,
        * transliterates column 1 (title) and column 5 (keyword) to ASCII
          with ``unidecode`` and filters them through
          ``nlp.filter_paper_title`` / ``nlp.filter_paper_keyword``,
        * stores a ``paper.Paper`` in ``self.papers`` keyed by the integer
          in column 0,
        * increments ``self.paper_titles[word]`` for every
          whitespace-separated word of the filtered title.

        Raises:
            IOError: if the file cannot be opened.
            ValueError / IndexError: if a row has too few columns or a
                non-integer value in columns 0, 2, 3 or 4.
        """
        # A parenthesised single-argument print is valid, and prints the
        # same text, under Python 2's print statement.
        print("Parsing Papers...")
        # The context manager closes the file even when a malformed row
        # raises part-way through.
        with open(data_io.get_paths()["paper_processed_path"], "r") as f:
            next(f, None)  # discard the first line (treated as a header)
            for line in f:
                # NOTE(review): a plain comma split assumes no field contains
                # an embedded comma -- confirm against the file's producer.
                res = line.strip().split(",")
                paper_id = int(res[0])
                paper_title = unidecode.unidecode(
                    unicode(res[1], encoding="utf-8"))
                title_words = nlp.filter_paper_title(paper_title)
                paper_keyword = unidecode.unidecode(
                    unicode(res[5], encoding="utf-8"))
                filtered_keyword = nlp.filter_paper_keyword(paper_keyword)
                # Columns 2-4 are passed through as integers; their meaning
                # is defined by paper.Paper, which is not visible here.
                self.papers[paper_id] = paper.Paper(paper_id, title_words,
                                                    int(res[2]), int(res[3]),
                                                    int(res[4]),
                                                    filtered_keyword)

                for word in title_words.split():
                    # .get() supplies the zero for unseen words, so no bare
                    # except is needed that could hide unrelated errors.
                    self.paper_titles[word] = \
                        self.paper_titles.get(word, 0) + 1

        print("Done")
 def parse_journals(self):
     """Load the processed journals file into ``self.journals``.

     Reads the file at ``data_io.get_paths()["journal_processed_path"]``,
     discards its first line, and for every remaining line stores the
     filtered, ASCII-transliterated title (column 2) in ``self.journals``
     keyed by the integer in column 0. Each whitespace-separated word of
     the filtered title is counted in ``self.journal_freq``.

     Raises:
         IOError: if the file cannot be opened.
         ValueError / IndexError: if a row has fewer than three columns or
             a non-integer first column.
     """
     # A parenthesised single-argument print is valid, and prints the same
     # text, under Python 2's print statement.
     print("Parsing Journals...")
     # The original never closed this file; the context manager does, even
     # when a malformed row raises.
     with open(data_io.get_paths()["journal_processed_path"], "r") as f:
         next(f, None)  # discard the first line (treated as a header)
         for line in f:
             # NOTE(review): a plain comma split assumes no field contains
             # an embedded comma -- confirm against the file's producer.
             res = line.strip().split(",")
             journal_id = int(res[0])
             raw_journal_title = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             # Journal titles go through the same filter as paper titles.
             journal_title = nlp.filter_paper_title(raw_journal_title)
             self.journals[journal_id] = journal_title
             for word in journal_title.split():
                 # One hashed lookup replaces the linear scan of the list
                 # that .keys() builds under Python 2.
                 self.journal_freq[word] = \
                     self.journal_freq.get(word, 0) + 1
 def parse_conferences(self):
     """Load the processed conferences file into ``self.conferences``.

     Reads the file at
     ``data_io.get_paths()["conference_processed_path"]``, discards its
     first line, and for every remaining line stores the filtered,
     ASCII-transliterated title (column 2) in ``self.conferences`` keyed by
     the integer in column 0. Each whitespace-separated word of the
     filtered title is counted in ``self.conference_freq``.

     Raises:
         IOError: if the file cannot be opened.
         ValueError / IndexError: if a row has fewer than three columns or
             a non-integer first column.
     """
     # A parenthesised single-argument print is valid, and prints the same
     # text, under Python 2's print statement.
     print("Parsing Conferences...")
     # The original never closed this file; the context manager does, even
     # when a malformed row raises.
     with open(data_io.get_paths()["conference_processed_path"], "r") as f:
         next(f, None)  # discard the first line (treated as a header)
         for line in f:
             # NOTE(review): a plain comma split assumes no field contains
             # an embedded comma -- confirm against the file's producer.
             res = line.strip().split(",")
             conference_id = int(res[0])
             raw_conference_title = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             # Conference titles go through the same filter as paper titles.
             conference_title = nlp.filter_paper_title(raw_conference_title)
             self.conferences[conference_id] = conference_title
             for word in conference_title.split():
                 # One hashed lookup replaces the linear scan of the list
                 # that .keys() builds under Python 2.
                 self.conference_freq[word] = \
                     self.conference_freq.get(word, 0) + 1
Beispiel #4
0
 def parse_journals(self):
     """Populate ``self.journals`` and ``self.journal_freq`` from disk.

     The input path is ``data_io.get_paths()["journal_processed_path"]``.
     The first line of the file is skipped. Every other line is split on
     commas; column 0 is parsed as the integer journal id and column 2 is
     the title, which is decoded as UTF-8, transliterated with
     ``unidecode`` and filtered by ``nlp.filter_paper_title`` before being
     stored in ``self.journals[journal_id]``. ``self.journal_freq`` gets one
     increment per whitespace-separated word of that filtered title.

     Raises:
         IOError: if the file cannot be opened.
         ValueError / IndexError: if a row has fewer than three columns or
             a non-integer first column.
     """
     # A parenthesised single-argument print is valid, and prints the same
     # text, under Python 2's print statement.
     print("Parsing Journals...")
     path = data_io.get_paths()["journal_processed_path"]
     # "with" guarantees the handle is released; the original leaked it.
     with open(path, "r") as f:
         next(f, None)  # discard the first line (treated as a header)
         for line in f:
             # NOTE(review): a plain comma split assumes no field contains
             # an embedded comma -- confirm against the file's producer.
             res = line.strip().split(",")
             journal_id = int(res[0])
             raw_journal_title = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             journal_title = nlp.filter_paper_title(raw_journal_title)
             self.journals[journal_id] = journal_title
             for word in journal_title.split():
                 # dict.get avoids building and scanning the .keys() list
                 # (Python 2) for every word.
                 self.journal_freq[word] = \
                     self.journal_freq.get(word, 0) + 1
Beispiel #5
0
 def parse_conferences(self):
     """Populate ``self.conferences`` and ``self.conference_freq`` from disk.

     The input path is
     ``data_io.get_paths()["conference_processed_path"]``. The first line
     of the file is skipped. Every other line is split on commas; column 0
     is parsed as the integer conference id and column 2 is the title,
     which is decoded as UTF-8, transliterated with ``unidecode`` and
     filtered by ``nlp.filter_paper_title`` before being stored in
     ``self.conferences[conference_id]``. ``self.conference_freq`` gets one
     increment per whitespace-separated word of that filtered title.

     Raises:
         IOError: if the file cannot be opened.
         ValueError / IndexError: if a row has fewer than three columns or
             a non-integer first column.
     """
     # A parenthesised single-argument print is valid, and prints the same
     # text, under Python 2's print statement.
     print("Parsing Conferences...")
     path = data_io.get_paths()["conference_processed_path"]
     # "with" guarantees the handle is released; the original leaked it.
     with open(path, "r") as f:
         next(f, None)  # discard the first line (treated as a header)
         for line in f:
             # NOTE(review): a plain comma split assumes no field contains
             # an embedded comma -- confirm against the file's producer.
             res = line.strip().split(",")
             conference_id = int(res[0])
             raw_conference_title = unidecode.unidecode(
                 unicode(res[2], encoding="utf-8"))
             conference_title = nlp.filter_paper_title(raw_conference_title)
             self.conferences[conference_id] = conference_title
             for word in conference_title.split():
                 # dict.get avoids building and scanning the .keys() list
                 # (Python 2) for every word.
                 self.conference_freq[word] = \
                     self.conference_freq.get(word, 0) + 1
 def parse_papers(self):
     """Populate ``self.papers`` and ``self.paper_titles`` from disk.

     The input path is ``data_io.get_paths()["paper_processed_path"]``. The
     first line of the file is skipped. Every other line is split on
     commas and turned into a ``paper.Paper`` built from:

     * column 0 -- integer paper id (also the key in ``self.papers``),
     * column 1 -- title, decoded as UTF-8, transliterated with
       ``unidecode`` and filtered by ``nlp.filter_paper_title``,
     * columns 2-4 -- integers passed straight to ``paper.Paper``,
     * column 5 -- keyword text, transliterated and filtered by
       ``nlp.filter_paper_keyword``.

     ``self.paper_titles`` gets one increment per whitespace-separated word
     of the filtered title.

     Raises:
         IOError: if the file cannot be opened.
         ValueError / IndexError: if a row has too few columns or a
             non-integer value in columns 0, 2, 3 or 4.
     """
     # A parenthesised single-argument print is valid, and prints the same
     # text, under Python 2's print statement.
     print("Parsing Papers...")
     path = data_io.get_paths()["paper_processed_path"]
     # "with" releases the handle even if a malformed row raises; the
     # original only reached close() on success.
     with open(path, "r") as f:
         next(f, None)  # discard the first line (treated as a header)
         for line in f:
             # NOTE(review): a plain comma split assumes no field contains
             # an embedded comma -- confirm against the file's producer.
             res = line.strip().split(",")
             paper_id = int(res[0])
             paper_title = unidecode.unidecode(
                 unicode(res[1], encoding="utf-8"))
             title_words = nlp.filter_paper_title(paper_title)
             paper_keyword = unidecode.unidecode(
                 unicode(res[5], encoding="utf-8"))
             filtered_keyword = nlp.filter_paper_keyword(paper_keyword)
             self.papers[paper_id] = paper.Paper(
                 paper_id, title_words, int(res[2]), int(res[3]),
                 int(res[4]), filtered_keyword)

             for word in title_words.split():
                 # .get() supplies the zero for unseen words, so no bare
                 # except is needed that could hide unrelated errors.
                 self.paper_titles[word] = \
                     self.paper_titles.get(word, 0) + 1

     print("Done")