def parse_papers(self): # Create Papers print "Parsing Papers..." f = open(data_io.get_paths()["paper_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") paper_title = unidecode.unidecode(unicode(res[1], encoding="utf-8")) title_words = nlp.filter_paper_title(paper_title) paper_keyword = unidecode.unidecode( unicode(res[5], encoding="utf-8")) filtered_keyword = nlp.filter_paper_keyword(paper_keyword) self.papers[int(res[0])] = paper.Paper(int(res[0]), title_words, int(res[2]), int(res[3]), int(res[4]), filtered_keyword) for tt in title_words.split(): try: self.paper_titles[tt] = self.paper_titles[tt] + 1 except: self.paper_titles[tt] = 1 print "Done" f.close()
def parse_journals(self): print "Parsing Journals..." f = open(data_io.get_paths()["journal_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") journal_id = int(res[0]) raw_journal_title = unidecode.unidecode(unicode(res[2], encoding="utf-8")) journal_title = nlp.filter_paper_title(raw_journal_title) self.journals[journal_id] = journal_title for j in journal_title.split(): if j in self.journal_freq.keys(): self.journal_freq[j] = self.journal_freq[j] + 1 else: self.journal_freq[j] = 1
def parse_conferences(self): print "Parsing Conferences..." f = open(data_io.get_paths()["conference_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") conference_id = int(res[0]) raw_conference_title = unidecode.unidecode(unicode(res[2], encoding="utf-8")) conference_title = nlp.filter_paper_title(raw_conference_title) self.conferences[conference_id] = conference_title for c in conference_title.split(): if c in self.conference_freq.keys(): self.conference_freq[c] = self.conference_freq[c] + 1 else: self.conference_freq[c] = 1
def parse_journals(self): print "Parsing Journals..." f = open(data_io.get_paths()["journal_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") journal_id = int(res[0]) raw_journal_title = unidecode.unidecode( unicode(res[2], encoding="utf-8")) journal_title = nlp.filter_paper_title(raw_journal_title) self.journals[journal_id] = journal_title for j in journal_title.split(): if j in self.journal_freq.keys(): self.journal_freq[j] = self.journal_freq[j] + 1 else: self.journal_freq[j] = 1
def parse_conferences(self): print "Parsing Conferences..." f = open(data_io.get_paths()["conference_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") conference_id = int(res[0]) raw_conference_title = unidecode.unidecode( unicode(res[2], encoding="utf-8")) conference_title = nlp.filter_paper_title(raw_conference_title) self.conferences[conference_id] = conference_title for c in conference_title.split(): if c in self.conference_freq.keys(): self.conference_freq[c] = self.conference_freq[c] + 1 else: self.conference_freq[c] = 1
def parse_papers(self): # Create Papers print "Parsing Papers..." f = open(data_io.get_paths()["paper_processed_path"], "r") titles = f.readline() for l in f.readlines(): res = l.strip().split(",") paper_title = unidecode.unidecode(unicode(res[1], encoding="utf-8")) title_words = nlp.filter_paper_title(paper_title) paper_keyword = unidecode.unidecode(unicode(res[5], encoding="utf-8")) filtered_keyword = nlp.filter_paper_keyword(paper_keyword) self.papers[int(res[0])] = paper.Paper(int(res[0]), title_words, int(res[2]), int(res[3]), int(res[4]), filtered_keyword) for tt in title_words.split(): try: self.paper_titles[tt] = self.paper_titles[tt] + 1 except: self.paper_titles[tt] = 1 print "Done" f.close()