# Imports assumed by this snippet (the original file header is not shown;
# project-local imports such as AbstractCrawler and MysqlMessager are
# omitted because their module paths are unknown):
from urllib2 import Request, urlopen
from BeautifulSoup import BeautifulSoup as bs


class CourseContentCrawler(AbstractCrawler):

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        # Pass the class itself to super(); passing AbstractCrawler here
        # would skip AbstractCrawler's own __init__.
        super(CourseContentCrawler, self).__init__()
        self.mm = MysqlMessager("CourseDescription")
        self.libraries = []

    def _downloader(self, url, out_folder="doc/"):
        """
        Download the web page and store it in a Python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=hdr)
        page = urlopen(req)
        soup = bs(page)
        return soup

    def crawl(self):
        """
        Crawl information from the page
        @param self Pointer to class
        """
        self.mm.clear_table()
        url = "http://www.cs.helsinki.fi/en/courses?y=2014&s%5B%5D=K&l%5B%5D=E"
        try:
            soup = self._downloader(url)
            self._parse_and_store(soup)
        except Exception as e:
            print e
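
# Usage sketch (not part of the original file): crawl() takes no arguments,
# since the course-listing URL is hard-coded inside the method. Assumes
# MysqlMessager is configured with valid database credentials.
if __name__ == "__main__":
    crawler = CourseContentCrawler()
    crawler.crawl()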
# Imports assumed by this snippet (the original file header is not shown;
# project-local imports such as AbstractCrawler and MysqlMessager are
# omitted because their module paths are unknown):
import codecs


class NameCrawler(AbstractCrawler):

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        super(NameCrawler, self).__init__()
        self.mm = MysqlMessager("Persons")

    def __repr__(self):
        """
        Show a description of this class
        @param self Pointer to class
        """
        return "<NameCrawler name:%s>" % self.name

    def _parse_and_store(self, soup):
        """
        Parse the crawled web page and store the results in the database
        @param self Pointer to class
        @param soup Structured data to be parsed
        """
        super(NameCrawler, self)._parse_and_store(soup)
        log_file = codecs.open(self.log_dir + "name_crawler_log_file.txt", "w", "utf-8")
        self.mm.clear_table()
        for link in soup.findAll("a"):
            # Person links are marked with rel="Person" on the page.
            if link.has_key('rel') and 'Person' == link['rel']:
                names = link.span.contents[0].split(',')
                href = link['href']
                sql = u"INSERT INTO Persons (ID, FirstName, LastName, Link) VALUES (default, \"" + \
                    names[0] + u"\",\"" + names[1] + u"\",\"" + href + u"\")"
                self.mm.execute_sql(sql, log_file)
        log_file.close()

    def _downloader(self, url, out_folder="doc/"):
        """
        Download the web page and store it in a Python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """
        return super(NameCrawler, self)._downloader(url)

    def crawl(self, url):
        """
        Crawl information from the page
        @param self Pointer to class
        @param url URL to be downloaded
        """
        soup = self._downloader(url)
        self._parse_and_store(soup)
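
# Usage sketch (not part of the original file): crawl() expects the URL of a
# staff-listing page whose person links carry rel="Person". The URL below is
# a placeholder assumption, not one taken from the project.
if __name__ == "__main__":
    crawler = NameCrawler()
    crawler.crawl("http://www.cs.helsinki.fi/en/people")  # hypothetical URL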
class PaperAbstractCrawler(AbstractCrawler):
    """
    Class : PaperAbstractCrawler
    Description: Crawls links to papers from Google Scholar, given the
    paper names stored in the database.
    """

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperAbstractCrawler, self).__init__()
        self.mm = MysqlMessager("PaperLinks")
        self.libraries = []

    def crawl(self):
        """
        Crawl information from the page
        @param self Pointer to class
        """
        import time
        import random
        #self.mm.clear_table()
        sql = ("SELECT PaperNames.PaperName, PaperNames.Paper_ID, Persons.FirstName, LastName "
               "FROM PaperNames INNER JOIN Persons ON PaperNames.P_ID = Persons.ID")
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            # Hard-coded resume point: skip paper IDs already crawled in an
            # earlier, interrupted run.
            if row[1] > 795:
                self.name = row[0]
                url_name = '+'.join(self.name.split(' '))
                url = "http://scholar.google.fi/scholar?as_q=%s&as_occt=title&hl=en" % url_name
                print url
                try:
                    soup = self._downloader(url)
                    self._parse_and_store(soup, row[1])
                except Exception as e:
                    print e
                # Randomized pause between requests to avoid being rate
                # limited by Google Scholar.
                time.sleep(random.randint(60, 130))
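
# Usage sketch (not part of the original file): crawl() reads paper names
# from the PaperNames table joined with Persons, so NameCrawler and
# PaperNameCrawler must have populated those tables first.
if __name__ == "__main__":
    crawler = PaperAbstractCrawler()
    crawler.crawl()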
        contents = None
        if "pdf" in paper_link:
            # Direct PDF link: download the file and extract the abstract
            # from it.
            try:
                from urllib import urlretrieve
                urlretrieve(paper_link, path_name)
                contents = self.pdf_abstract_extractor(path_name)
            except Exception as e:
                stderr.write("Error: %s .\n" % e)
        else:
            # HTML page: dispatch to a site-specific extractor keyed on the
            # host name part of the URL.
            try:
                processed_link = paper_link.split("/")
                web_site = processed_link[2]
                contents = self._website_extractor.website_extractor(
                    web_site, paper_link)
                print "Downloaded: [ link id: %s, paper_id: %s. (link: %s)]." % (
                    link_id, paper_id, paper_link)
            except Exception as e:
                stderr.write("Error: %s, link id: %s, paper id: %s. (link: %s). \n" %
                             (e, link_id, paper_id, paper_link))
        return contents


if __name__ == "__main__":
    mysql_db = MysqlMessager()
    abstract_extractor = AbstractExtractor(mysql_db)
    abstract_extractor.download_data()
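
# A minimal sketch of a PDF abstract extractor in the spirit of
# pdf_abstract_extractor above (an assumption; the original implementation
# is not shown). It pulls the text of the first page with pyPdf, which is
# where the abstract usually sits:
from pyPdf import PdfFileReader

def extract_first_page_text(path_name):
    """Return the raw text of the first PDF page as a crude abstract proxy."""
    reader = PdfFileReader(open(path_name, "rb"))
    return reader.getPage(0).extractText()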
# Imports assumed by this snippet (the original file header is not shown;
# project-local imports such as AbstractCrawler and MysqlMessager are
# omitted because their module paths are unknown):
import codecs
import time
from urllib2 import urlopen


class PaperNameCrawler(AbstractCrawler):
    """
    Class : PaperNameCrawler
    Description: Paper Name crawler crawls the names of papers from the
    Tuhat database of the University of Helsinki, given the name of
    the author.
    """

    def __init__(self):
        """
        Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperNameCrawler, self).__init__()
        self.mm = MysqlMessager("PaperNames")

    def crawl(self):
        """
        Crawl information from the page
        @param self Pointer to class
        """
        self.mm.clear_table()
        sql = "SELECT * FROM Persons"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()
        for row in rows:
            url = row[3]
            soup = self._downloader(url)
            self._parse_and_store(soup, row[0])

    def _downloader(self, url, out_folder="doc/"):
        """
        Download the web page and store it in a Python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """
        return super(PaperNameCrawler, self)._downloader(url)

    def _parse_and_store(self, soup, foreign_key):
        """
        Parse the crawled web page and store the results in the database
        @param self Pointer to class
        @param soup Structured data to be parsed
        @param foreign_key ID of the person the papers belong to
        """

        def doi2url(doi):
            """
            Resolve a DOI address to the URL it redirects to.
            ##TODO: Not working for now
            """
            try:
                link = urlopen(doi).geturl()
            except Exception as e:
                # Error occurred while resolving the DOI address
                print "Exception happened while processing doi: %s" % e
                link = doi
            print link
            return link

        super(PaperNameCrawler, self)._parse_and_store(soup)
        print self.log_dir
        log_file = codecs.open(
            self.log_dir + "paper_name_crawler_log_file.txt", "w", "utf-8")
        for p in soup.findAll('p', {'class': 'uh_relationlist'}):
            inner_soup = self._downloader(p.a['href'])
            if 'publications.html' == p.a['href'].split('/')[-1]:
                for inner_link in inner_soup.findAll('h2', {'class': 'title'}):
                    paper_names = inner_link.a.span.contents[0]
                    paper_link = inner_link.a['href']
                    paper_soup = self._downloader(paper_link)
                    # Unquoted "default" falls through to the column default
                    # when no link can be resolved.
                    paper_out_link_resolved = "default"
                    try:
                        doi = paper_soup.findAll(
                            'ul', {'class': 'relations digital_object_identifiers'})[0].li.a['href']
                        paper_out_link_resolved = u"\"" + doi2url(doi) + u"\""
                        time.sleep(60)
                    except:
                        # No DOI found in the web page; look for a "Links"
                        # section that points to the paper instead.
                        try:
                            for h in paper_soup.findAll('h3', {'class': 'subheader'}):
                                if h.contents[0] == "Links":
                                    print "Links"
                                    paper_out_link_resolved = u"\"" + h.parent.ul.li.a['href'] + u"\""
                        except:
                            # No links found either; keep the default value.
                            pass
                    sql = u"INSERT INTO PaperNames (Paper_ID, PaperName, Link, P_ID) VALUES (default, \"" + \
                        paper_names + u"\", " + paper_out_link_resolved + u", " + str(foreign_key) + u")"
                    print sql
                    self.mm.execute_sql(sql, log_file)
        log_file.close()
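
# The INSERT above splices values straight into the SQL string, which breaks
# on titles containing quotes and is open to injection. A minimal sketch of a
# parameterized alternative, assuming direct access to a MySQLdb connection
# (MysqlMessager's internals are not shown, so this helper is hypothetical):
def insert_paper_name(connection, paper_name, link, person_id):
    """Insert one PaperNames row, letting the driver escape the values."""
    cursor = connection.cursor()
    cursor.execute(
        "INSERT INTO PaperNames (Paper_ID, PaperName, Link, P_ID) "
        "VALUES (default, %s, %s, %s)",
        (paper_name, link, person_id))
    connection.commit()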