class PaperAbstractCrawler(AbstractCrawler):
    """
    Class : PaperAbstractCrawler
    Description: Looks up every paper title stored in the PaperNames table
    with a Google Scholar title search and hands the downloaded result page,
    together with the paper's Paper_ID, to _parse_and_store.
    """

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperAbstractCrawler, self).__init__()
        # presumably bound to the "PaperLinks" table - confirm in MysqlMessager
        self.mm = MysqlMessager("PaperLinks")
        # Not read anywhere in this class; presumably consumed by the
        # inherited parsing code - TODO confirm.
        self.libraries = []

    def crawl(self, min_paper_id=795):
        """
        crawl information from page

        Selects every paper joined with its author, builds a Google Scholar
        title query for each paper and passes the downloaded page to
        _parse_and_store. Failures of a single paper are printed and do not
        stop the crawl. After every lookup the crawler pauses for a random
        60 to 130 seconds.

        @param self Pointer to class
        @param min_paper_id Only papers whose Paper_ID is strictly greater
               than this value are looked up. Defaults to 795, the
               threshold that used to be hard-coded, so existing callers
               keep their behaviour.
        """
        import random
        import time
        try:
            from urllib import quote_plus  # Python 2
        except ImportError:
            from urllib.parse import quote_plus  # Python 3

        # NOTE(review): the target table is deliberately not cleared here;
        # presumably so an interrupted crawl can be resumed via min_paper_id.
        sql = "SELECT PaperNames.PaperName,PaperNames.Paper_ID, Persons.FirstName, LastName FROM PaperNames inner join Persons on PaperNames.P_ID = Persons.ID"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            paper_id = row[1]
            if paper_id <= min_paper_id:
                continue

            self.name = row[0]
            # Percent-encode the title so characters such as '&', '#', '%'
            # or non-ASCII letters cannot corrupt the query string. Spaces
            # still become '+', exactly as before.
            title = self.name
            if not isinstance(title, bytes):
                title = title.encode("utf-8")
            url = "http://scholar.google.fi/scholar?as_q=%s&as_occt=title&hl=en" % quote_plus(title)
            print(url)
            try:
                soup = self._downloader(url)
                self._parse_and_store(soup, paper_id)
            except Exception as e:
                # Best effort: report the failure and go on with the next paper.
                print(e)
            # Throttle after every lookup, successful or not (presumably to
            # avoid being blocked by the search engine).
            time.sleep(random.randint(60, 130))
class PaperNameCrawler(AbstractCrawler):
    """
    Class : PaperNameCrawler
    Description: Paper Name crawler crawls the name of paper from Tuhat
    Database of University of Helsinki given name of the author. For every
    row of the Persons table it downloads the person's page, follows the
    publication list and inserts one PaperNames row per paper found.
    """

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperNameCrawler, self).__init__()
        self.mm = MysqlMessager("PaperNames")

    def crawl(self):
        """
        crawl information from page

        Clears the table handled by the messenger, then downloads and
        parses the page of every person in the Persons table.

        @param self Pointer to class
        """
        self.mm.clear_table()
        sql = "SELECT * FROM Persons"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            # assumes column 0 is the person's ID (it is stored as P_ID) and
            # column 3 is the URL of the person's page - TODO confirm schema
            url = row[3]
            soup = self._downloader(url)
            self._parse_and_store(soup, row[0])

    def _downloader(self, url, out_folder="doc/"):
        """
        Download the web page and store it python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information. Accepted for
               interface compatibility only; it is not forwarded to the
               base class.
        @return Whatever the base class downloader returns for url
        """
        return super(PaperNameCrawler, self)._downloader(url)

    def _parse_and_store(self, soup, foreign_key):
        """
        Find the papers linked from a person's page and store them

        For every 'uh_relationlist' paragraph whose link ends in
        'publications.html' the publication list is downloaded; for every
        paper title on it the paper page is downloaded, a link to the paper
        is looked up (DOI first, then the "Links" section) and one row is
        inserted into PaperNames. Executed statements are passed to the
        messenger together with a log file that is always closed, even if
        the crawl fails half way.

        @param self Pointer to class
        @param soup Parsed page of the person
        @param foreign_key ID of the person, stored as P_ID
        """
        # Local import: the pause below is no longer inside a catch-all
        # handler, so the name has to be guaranteed to exist.
        import time

        def doi2url(doi):
            """
            Resolve a DOI link to the URL it finally redirects to.
            Falls back to the DOI link itself if it can not be opened.
            ##TODO: Not working for now
            """
            try:
                response = urlopen(doi)
                try:
                    link = response.geturl()
                finally:
                    # do not leak the connection
                    response.close()
            except Exception as e:
                # Error occured while resolving doi address
                print("Exception happened while processing doi: %s" % e)
                link = doi
            print(link)
            return link

        def sql_quote(text):
            """
            Wrap text in double quotes for use as an SQL string literal.
            Backslashes and double quotes inside scraped text are escaped
            so they can not terminate the literal early. Assumes MySQL's
            default backslash-escape mode - TODO confirm server sql_mode.
            """
            escaped = text.replace("\\", "\\\\").replace('"', '""')
            return u"\"" + escaped + u"\""

        super(PaperNameCrawler, self)._parse_and_store(soup)
        print(self.log_dir)
        log_file = codecs.open(
            self.log_dir + "paper_name_crawler_log_file.txt", "w", "utf-8")
        try:
            for p in soup.findAll('p', {'class': 'uh_relationlist'}):
                # NOTE(review): the page is downloaded even when it turns
                # out not to be the publication list; kept as is.
                inner_soup = self._downloader(p.a['href'])
                if 'publications.html' != p.a['href'].split('/')[-1]:
                    continue
                for inner_link in inner_soup.findAll('h2', {'class': 'title'}):
                    paper_names = inner_link.a.span.contents[0]
                    paper_link = inner_link.a['href']
                    paper_soup = self._downloader(paper_link)
                    # SQL keyword used when no link can be found
                    paper_out_link_resolved = "default"
                    try:
                        doi = paper_soup.findAll(
                            'ul',
                            {'class': 'relations digital_object_identifiers'
                             })[0].li.a['href']
                        paper_out_link_resolved = sql_quote(doi2url(doi))
                    except Exception:
                        # can not find any links in the web page
                        try:
                            # then tries to find whether there is any link
                            # connects to the paper
                            for h in paper_soup.findAll(
                                    'h3', {'class': 'subheader'}):
                                if h.contents[0] == "Links":
                                    print("Links")
                                    paper_out_link_resolved = sql_quote(
                                        h.parent.ul.li.a['href'])
                        except Exception:
                            # can not find links either; deliberately best
                            # effort, the row is stored with the default link
                            pass
                    else:
                        # Pause only after a DOI was resolved. Kept outside
                        # the try block so that Ctrl-C during the pause
                        # stops the crawl instead of being swallowed.
                        time.sleep(60)
                    sql = u"INSERT INTO PaperNames (Paper_ID, PaperName, Link, P_ID) VALUES ( default," +\
                        sql_quote(paper_names) + u", " + paper_out_link_resolved + u", " + str(foreign_key) + u")"
                    print(sql)
                    self.mm.execute_sql(sql, log_file)
        finally:
            log_file.close()