def __get_paper_from_acm(self, entry_url):
    """Build a Paper (title, abstract, authors) from an ACM entry page.

    Fetches ``entry_url`` through ``self.op``, reads the title from the
    ``divmain`` block, downloads the abstract page that the entry page
    links to, and adds one Author per row of the author table.

    Args:
        entry_url: URL of the ACM page describing the paper.

    Returns:
        A Paper created from the title and abstract text, with every
        listed author attached via ``add_author``.

    Raises:
        IndexError: if the ``divmain`` block, the title node, the author
            table, or an author's name link is missing from the page.
        AttributeError: if no abstract link is found in the page source
            (the regex search returns None).
    """
    resp_body = self.op.open(entry_url).read()
    root = sp.fromstring(resp_body)
    divmain = root.xpath("//div[@id='divmain']")[0]
    title = divmain.xpath("div/h1/strong")[0].text

    # The abstract is served from a separate URL; its relative link is
    # pulled out of the raw page source with a regex.
    abst_link = re.compile(r"tab.abstract.cfm[^']*").search(resp_body).group(0)
    abst_url = 'http://dl.acm.org/' + abst_link
    abst_body = self.op.open(abst_url).read()

    # Concatenate every text node found under the abstract paragraphs.
    abst = ''.join(
        sp.fromstring(abst_body).xpath(
            '//div/p/div/p/descendant-or-self::*/text()'))

    paper = Paper(title, abst)

    # The second nested table inside divmain holds one author per row.
    author_table = divmain.xpath("table/tr/td/table")[1]
    for author_row in author_table.xpath('tr'):
        name = author_row.xpath('td/a/text()')[0]
        # The affiliation is inside an <a> only when it is linked; fall
        # back to the plain <small> text, then to an empty string, instead
        # of raising IndexError for authors without a linked affiliation.
        affn_hits = (author_row.xpath('td/a/small/text()')
                     or author_row.xpath('td/small/text()'))
        affn = affn_hits[0] if affn_hits else ""
        paper.add_author(Author(name, affn))

    return paper
def __get_paper_from_ms(self, entry_url):
    """Build a Paper (title and authors) from an entry page.

    The page is fetched through ``self.op`` and passed through
    ``self.__deljs_html`` before parsing (presumably to strip script
    content -- confirm against that helper). No abstract is extracted.

    Args:
        entry_url: URL of the page describing the paper.

    Returns:
        A Paper created from the title, with one author added for every
        ``author-name-tooltip`` link found in the paper description block.

    Raises:
        IndexError: if the title span or the second child div of the
            paper description block is missing.
    """
    raw_html = self.op.open(entry_url).read()
    tree = sp.fromstring(self.__deljs_html(raw_html))

    title_nodes = tree.xpath("//span[@id='ctl00_MainContent_PaperItem_title']")
    paper = Paper(title_nodes[0].text)

    # The second child div of the paper block carries the author links.
    description_divs = tree.xpath(
        "//div[@id='ctl00_MainContent_PaperItem_divPaper']/div")
    author_links = description_divs[1].xpath(
        "a[@class='author-name-tooltip']/@href")

    # Each link is resolved to an author by the dedicated helper.
    for link in author_links:
        paper.add_author(self.__get_author_from_ms(link))

    return paper
def __get_paper_from_acm(self, entry_url):
    """Build a Paper (title and authors) from an ACM entry page.

    The URL is first passed through ``self.__wrapper``, fetched through
    ``self.op``, and the response is run through ``self.__deljs_html``
    before parsing (what those helpers do is not visible here -- confirm
    against their definitions). The abstract is intentionally not fetched.

    Args:
        entry_url: URL of the ACM page describing the paper.

    Returns:
        A Paper created from the title, with one Author (name plus
        affiliation, possibly an empty string) per row of the author table.

    Raises:
        IndexError: if the ``divmain`` block, the title node, the author
            table, or an author's name link is missing from the page.
    """
    page_source = self.op.open(self.__wrapper(entry_url)).read()
    tree = sp.fromstring(self.__deljs_html(page_source))

    main_block = tree.xpath("//div[@id='divmain']")[0]
    paper = Paper(main_block.xpath("div/h1/strong")[0].text)

    # Affiliation lookups, tried in order: a linked affiliation sits inside
    # the <a> tag, an unlinked one directly inside the cell.
    affiliation_queries = ('td/a/small/text()', 'td/small/text()')

    # The second nested table inside divmain holds one author per row.
    rows = main_block.xpath("table/tr/td/table")[1].xpath('tr')
    for row in rows:
        author_name = row.xpath('td/a/text()')[0]

        affiliation = ""
        for query in affiliation_queries:
            matches = row.xpath(query)
            if matches:
                affiliation = matches[0]
                break

        paper.add_author(Author(author_name, affiliation))

    return paper