Python Crawler.Crawler Examples

Programming Language: Python

Namespace/Package Name: rrslib.web.crawler

Class/Type: Crawler

Method/Function: Crawler

Examples at hotexamples.com: 4

Python Crawler.Crawler – 4 examples found. These are the top-rated real-world Python examples of rrslib.web.crawler.Crawler.Crawler, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

start(5)

Crawler(4)

set_headers(2)

set_handler(1)

Example #1

Show file

    def __init__(self, verbose=False, debug=False):
        """
        Constructor.
        @param verbose: flag stored on the instance (private attribute)
        @param debug: flag stored on the instance as __dbg__
        """
        self.__dbg__ = debug
        self.__verbos = verbose

        # HTTP headers passed to the crawler through set_headers().
        request_headers = (
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        )
        self._crawler = Crawler()
        self._crawler.set_headers(request_headers)

        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        # ---- helpers for manual processing ----
        self.agent = GetHTMLAndParse()
        # locates the region in which records are searched for
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

Example #2

Show file

 def __init__(self, headercoef=2.5):
     """
     Constructor: creates the helper objects and stores the header coefficient.
     @param headercoef: lower bound of an element's visibility for it to be
                        handled as a header
     """
     self.generalizer = _RRSPropertyGeneralizer()
     self.ee = EntityExtractor()
     self.headercoef = headercoef
     self.bibtexparser = BibTeXParser()

     # Keep a local handle so the handler can be attached after the
     # remaining helpers are created (same call order as before).
     crawler = Crawler()
     self.crawler = crawler
     self.mime_handler = MIMEhandler()
     crawler.set_handler(FileDownloader)

Example #3

Show file

 def __init__(self, xmlcompatibility='db09'):
     """
     Constructor: creates the helper objects and records the XML compatibility.
     @param xmlcompatibility: compatibility tag (default 'db09'); it is passed
                              to CitationEntityExtractor and the part left
                              after stripping leading 'd'/'b' characters is
                              stored as an integer in self._xmlvalid
     """
     self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0, headercoef=3.0, mintextlen=30)
     self.citaextractor = CitationEntityExtractor(ALL, xmlcompatibility=xmlcompatibility)
     self.ee = EntityExtractor()
     self.mime = MIMEhandler()
     self.crawler = Crawler()
     self.bibtex = BibTeXParser()
     self.xmlcompatibility = xmlcompatibility
     # lstrip('db') removes any leading 'd' or 'b' characters (a character
     # set, not a literal prefix), e.g. 'db09' -> '09' -> 9.
     version_digits = xmlcompatibility.lstrip('db')
     self._xmlvalid = int(version_digits)
     self._publ_list = []

Example #4

Show file

File: sequencewrapper.py Project: lucidvoci/ResearchProjectPortal

        """
        Returns xml in lxml.etree.ElementTree object.
        """
        self._make_xml()
        return self.xmldocument


# ------------------------------------------------------------------------------
# end of class HTMLSequenceWrapper
# ------------------------------------------------------------------------------

if __name__ == '__main__':
    from rrslib.web.crawler import Crawler
    sp = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

    c = Crawler()
    #urls = ['http://www.ualberta.ca/~bhan/publ.htm']
    #urls = ['http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1']
    urls = [
        'http://kaminari.scitec.kobe-u.ac.jp/pub_en.html',
        'http://www.cis.uab.edu/sprague/',
        'http://www2.lifl.fr/~carle/old/mabib.htm',
        'http://www.poli.usp.br/p/fabio.cozman/',
        'http://www.cs.washington.edu/homes/weld/pubs.html',
        'http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1'
    ]
    #urls = ['https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=2&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=3&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=4&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=5&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=6&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=7&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=8&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=9&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=10&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=11&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=12&lang=1']

    #urls = ['http://www.cs.princeton.edu/~schapire/publist.html']
    #urls = ['http://bionum.cs.purdue.edu/p.html']
    #urls = ['http://www.awissenet.eu/publications.aspx']