Example #1
# Imports this snippet assumes (not shown in the original): Python 2's
# urllib2 and BeautifulSoup 3; AbstractCrawler and MysqlMessager are the
# project's own helpers.
from urllib2 import Request, urlopen
from BeautifulSoup import BeautifulSoup as bs  # bs4 equivalent: from bs4 import BeautifulSoup as bs


class CourseContentCrawler(AbstractCrawler):
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        # The original called super(AbstractCrawler, self), which skips
        # AbstractCrawler's own __init__ in the MRO; pass the current class.
        super(CourseContentCrawler, self).__init__()
        self.mm = MysqlMessager("CourseDescription")
        self.libraries = []

    def _downloader(self, url, out_folder="doc/"):
        """ Download the web page and store it python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """ ""
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=hdr)
        page = urlopen(req)
        soup = bs(page)
        return soup

    def crawl(self):
        """ Crawl information from the course listing page
        @param self Pointer to class
        """
        self.mm.clear_table()
        url = "http://www.cs.helsinki.fi/en/courses?y=2014&s%5B%5D=K&l%5B%5D=E"
        try:
            soup = self._downloader(url)
            self._parse_and_store(soup)
        except Exception, e:
            print e
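
All of these examples lean on the project's MysqlMessager helper, whose implementation is not shown on this page. From the calls used here (clear_table, execute_sql, fetch), a minimal compatible sketch, assuming MySQLdb and placeholder connection settings, might look like:

# Minimal sketch of the MysqlMessager interface these snippets assume.
# Connection parameters are placeholders, not the project's real settings.
import MySQLdb

class MysqlMessager(object):
    def __init__(self, table=None):
        self.table = table
        self.conn = MySQLdb.connect(user="user", passwd="passwd",
                                    db="crawler", charset="utf8")
        self.cursor = self.conn.cursor()

    def clear_table(self):
        # Empty the table this messenger is bound to.
        self.cursor.execute("DELETE FROM " + self.table)
        self.conn.commit()

    def execute_sql(self, sql, log_file=None):
        # Run one statement, optionally logging it to an open file handle.
        if log_file is not None:
            log_file.write(sql + u"\n")
        self.cursor.execute(sql)
        self.conn.commit()

    def fetch(self):
        # Rows produced by the most recent SELECT.
        return self.cursor.fetchall()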
Example #2
class NameCrawler(AbstractCrawler):
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        super(NameCrawler, self).__init__()
        self.mm = MysqlMessager("Persons")
        self.name = ""  # __repr__ below reads self.name, so give it a default

    def __repr__(self):
        """ Return a short description of this instance
        @param self Pointer to class
        """
        return "<NameCrawler name:%s>" % self.name

    def _parse_and_store(self, soup):
        """ Parse the downloaded page and store person records in the database
        @param self Pointer to class
        @param soup Structured data to be parsed
        """
        super(NameCrawler, self)._parse_and_store(soup)
        log_file = codecs.open(self.log_dir + "name_crawler_log_file.txt", "w",
                               "utf-8")
        self.mm.clear_table()
        for link in soup.findAll("a"):
            if link.get('rel') == 'Person':
                names = link.span.contents[0].split(',')
                href = link['href']
                # Note: string concatenation breaks on names containing quotes;
                # see the parameterized sketch after this example.
                sql = u"INSERT INTO Persons (ID, FirstName, LastName, Link) VALUES (default, \"" + names[0] + u"\",\"" +\
                      names[1].strip() + u"\",\"" + href + u"\")"  # strip the space left after the comma
                self.mm.execute_sql(sql, log_file)
        log_file.close()

    def _downloader(self, url, out_folder="doc/"):
        """ Download the web page and parse it into a Python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """
        return super(NameCrawler, self)._downloader(url)

    def crawl(self, url):
        """ Crawl information from the page
        @param self Pointer to class
        @param url URL to be downloaded
        """
        soup = self._downloader(url)
        self._parse_and_store(soup)
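
The INSERT above is built by string concatenation, which breaks as soon as a name or link contains a quote character. A hedged sketch of the same insert with DB-API parameter binding (placeholder connection settings; the real MysqlMessager internals are not shown on this page):

# Parameter binding lets the driver escape quotes and special characters.
import MySQLdb

conn = MySQLdb.connect(user="user", passwd="passwd",
                       db="crawler", charset="utf8")
cursor = conn.cursor()
sql = ("INSERT INTO Persons (ID, FirstName, LastName, Link) "
       "VALUES (default, %s, %s, %s)")
cursor.execute(sql, (u"Doe", u"Jane", u"http://example.org/jane"))  # made-up row
conn.commit()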
Example #3
class PaperAbstractCrawler(AbstractCrawler):
    """
    Class :  PaperNameCrawler
    Description: Paper Name crawler crawls the name of paper from Tuhat Database of University of Helsinki given name
    of the author.
    """
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperAbstractCrawler, self).__init__()
        self.mm = MysqlMessager("PaperLinks")
        self.libraries = []

    def crawl(self):
        """ crawl information from page
        @param self Pointer to class
        """
        import time
        import random
        #self.mm.clear_table()
        sql = ("SELECT PaperNames.PaperName, PaperNames.Paper_ID, Persons.FirstName, Persons.LastName "
               "FROM PaperNames inner join Persons on PaperNames.P_ID = Persons.ID")
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()  # renamed from "iter", which shadows the builtin
        for row in rows:
            if row[1] > 795:  # apparently a hard-coded resume point from an earlier, interrupted run
                self.name = row[0]
                url_name = '+'.join(self.name.split(' '))
                url = "http://scholar.google.fi/scholar?as_q=%s&as_occt=title&hl=en" % url_name
                print url
                try:
                    soup = self._downloader(url)
                    self._parse_and_store(soup, row[1])
                except Exception, e:
                    print e
                # Wait one to two minutes between requests to avoid being rate-limited.
                time.sleep(random.randint(60, 130))
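
The Scholar query above is built by joining title words with '+', which breaks for titles containing reserved characters such as '&'. A small sketch using Python 2's urllib.urlencode to build the same query robustly (the title is a made-up example):

import urllib

title = "Crawling & indexing course pages"  # made-up example title
params = urllib.urlencode({'as_q': title, 'as_occt': 'title', 'hl': 'en'})
url = "http://scholar.google.fi/scholar?" + params
print url  # the '&' in the title is now safely percent-encoded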
Example #4
        # Fragment: the tail of an AbstractExtractor method. paper_link,
        # path_name, link_id and paper_id are bound earlier in the method,
        # and stderr is sys.stderr (imported at module level, not shown).
        contents = None

        if "pdf" in paper_link:
            try:
                from urllib import urlretrieve
                urlretrieve(paper_link, path_name)
                contents = self.pdf_abstract_extractor(path_name)
            except Exception, e:
                stderr.write("Error: %s .\n" % e)
            pass
        else:

            try:
                processed_link = paper_link.split("/")
                web_site = processed_link[2]
                contents = self._website_extractor.website_extractor(
                    web_site, paper_link)
                print "Downloaded: [ link id: %s, paper_id: %s. (link: %s)]." % (
                    link_id, paper_id, paper_link)
            except Exception, e:
                stderr.write(
                    "Error: %s, link id: %s, paper id: %s. (link: %s). \n" %
                    (e, link_id, paper_id, paper_link))
        return contents


if __name__ == "__main__":
    mysql_db = MysqlMessager()
    abstract_extractor = AbstractExtractor(mysql_db)
    abstract_extractor.download_data()
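
pdf_abstract_extractor is not shown in this fragment. A minimal sketch of what such a helper might do, using PyPDF2 to pull text from the first page, where the abstract usually sits (PyPDF2 is an assumption; the original project may use a different PDF library):

# Hedged sketch of a pdf_abstract_extractor-style helper.
from PyPDF2 import PdfFileReader

def extract_first_page_text(path_name):
    # The abstract is usually on page one of a paper PDF.
    with open(path_name, "rb") as f:
        reader = PdfFileReader(f)
        return reader.getPage(0).extractText()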
Example #5
# Imports this snippet assumes (not shown in the original): codecs, time,
# and urllib2's urlopen, plus the project's AbstractCrawler and MysqlMessager.
class PaperNameCrawler(AbstractCrawler):
    """
    Class :  PaperNameCrawler
    Description: Crawls the names of papers from the Tuhat database of the
    University of Helsinki, given the name of the author.
    """
    def __init__(self):
        """ Set up mysql messenger
        @param self Pointer to class
        """
        super(PaperNameCrawler, self).__init__()
        self.mm = MysqlMessager("PaperNames")

    def crawl(self):
        """ crawl information from page
        @param self Pointer to class
        """
        self.mm.clear_table()
        sql = "SELECT * FROM Persons"
        self.mm.execute_sql(sql)
        rows = self.mm.fetch()  # renamed from "iter", which shadows the builtin
        for row in rows:
            url = row[3]  # column 3 of Persons is Link
            soup = self._downloader(url)
            self._parse_and_store(soup, row[0])  # row[0] is the person's ID

    def _downloader(self, url, out_folder="doc/"):
        """ Download the web page and parse it into a Python data structure
        @param self Pointer to class
        @param url URL to be downloaded
        @param out_folder Folder that stores information
        """
        return super(PaperNameCrawler, self)._downloader(url)

    def _parse_and_store(self, soup, foreign_key):
        def doi2url(doi):
            """
            Resolve a DOI address to the publisher's landing page URL by
            following redirects.
            ##TODO: return a bibTeX metadata string instead; not working for now
            """
            try:
                link = urlopen(doi).geturl()
            except Exception, e:
                # An error occurred while resolving the doi address
                print "Exception happened while processing doi: %s" % e
                link = doi
            print link
            return link

        super(PaperNameCrawler, self)._parse_and_store(soup)
        print self.log_dir
        log_file = codecs.open(
            self.log_dir + "paper_name_crawler_log_file.txt", "w", "utf-8")
        for p in soup.findAll('p', {'class': 'uh_relationlist'}):
            inner_soup = self._downloader(p.a['href'])
            if 'publications.html' == p.a['href'].split('/')[-1]:
                for inner_link in inner_soup.findAll('h2', {'class': 'title'}):
                    paper_names = inner_link.a.span.contents[0]
                    paper_link = inner_link.a['href']
                    paper_soup = self._downloader(paper_link)
                    paper_out_link_resolved = "default"
                    try:
                        doi = paper_soup.findAll(
                            'ul',
                            {'class': 'relations digital_object_identifiers'
                             })[0].li.a['href']
                        paper_out_link_resolved = u"\"" + doi2url(doi) + u"\""
                        #paper_out_link_resolved =  doi2url(doi)
                        time.sleep(60)
                    except Exception:
                        # No DOI link found in the page...
                        try:
                            # ...so look for a "Links" section that points to the paper instead
                            for h in paper_soup.findAll(
                                    'h3', {'class': 'subheader'}):
                                if h.contents[0] == "Links":
                                    print "Links"
                                    paper_out_link_resolved = u"\"" + h.parent.ul.li.a[
                                        'href'] + u"\""
                        except Exception:
                            # No links of any kind; keep the "default" placeholder
                            pass
                    sql = u"INSERT INTO PaperNames (Paper_ID, PaperName, Link, P_ID) VALUES ( default,\"" +\
                          paper_names + u"\", " + paper_out_link_resolved + u", " + str(foreign_key) + u")"
                    print sql
                    self.mm.execute_sql(sql, log_file)
        log_file.close()
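
The ##TODO in doi2url (returning BibTeX metadata rather than a URL) can be done with DOI content negotiation: https://doi.org honors an Accept: application/x-bibtex header. A sketch assuming a bare DOI string rather than the href used above:

# Hedged sketch of the bibTeX TODO via DOI content negotiation
# (https://citation.crosscite.org/docs.html); Python 2 urllib2.
import urllib2

def doi2bibtex(doi):
    """Return a BibTeX record for a bare DOI, e.g. doi2bibtex("10.1000/182")."""
    req = urllib2.Request("https://doi.org/" + doi,
                          headers={"Accept": "application/x-bibtex"})
    return urllib2.urlopen(req).read()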