Example #1
0
    def __init__(self, verbose=False, debug=False):
        """
        Prepare the crawler, the HTML sequence wrapper and the helper
        objects used for manual processing.

        @param verbose: value stored on the instance as the verbosity flag
        @param debug: value stored on the instance as the debug flag
        """
        self.__dbg__ = debug
        self.__verbos = verbose

        # Request headers handed to the crawler (a Firefox-on-Ubuntu
        # User-Agent string plus an Accept header for HTML/XML content).
        request_headers = (
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        )
        crawler = Crawler()
        self._crawler = crawler
        crawler.set_headers(request_headers)

        # Settings for the sequence wrapper, passed through as keywords.
        wrapper_settings = {
            'childcoef': 5.0,
            'headercoef': 3.0,
            'mintextlen': 20,
        }
        self._wraper = HTMLSequenceWrapper(**wrapper_settings)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        # ---- helpers for manual processing ----
        self.agent = GetHTMLAndParse()
        # locates the region in which records are searched for
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
Example #2
0
 def __init__(self, headercoef=2.5):
     """
     Constructor: creates the helper objects and stores the header
     coefficient.

     @param headercoef: lower bound of an element's visibility for the
                        element to be handled as a header
     """
     self.generalizer = _RRSPropertyGeneralizer()
     self.ee = EntityExtractor()
     self.headercoef = headercoef
     self.bibtexparser = BibTeXParser()

     # Keep a local handle on the crawler so it can be configured below.
     downloader = Crawler()
     self.crawler = downloader
     self.mime_handler = MIMEhandler()
     # FileDownloader is registered as the crawler's handler.
     downloader.set_handler(FileDownloader)
Example #3
0
 def __init__(self, xmlcompatibility='db09'):
     """
     Constructor: creates the extraction helpers and records the XML
     compatibility setting.

     @param xmlcompatibility: compatibility identifier (default 'db09').
                              It is forwarded to the citation extractor,
                              kept on the instance as given, and also
                              stored as an integer in self._xmlvalid.
     """
     wrapper_settings = dict(childcoef=7.0, headercoef=3.0, mintextlen=30)
     self.seqwrapper = HTMLSequenceWrapper(**wrapper_settings)
     self.citaextractor = CitationEntityExtractor(
         ALL,
         xmlcompatibility=xmlcompatibility,
     )
     self.ee = EntityExtractor()
     self.mime = MIMEhandler()
     self.crawler = Crawler()
     self.bibtex = BibTeXParser()
     self.xmlcompatibility = xmlcompatibility

     # lstrip('db') drops every leading 'd' or 'b' character (it takes a
     # character set, not a prefix); the rest must parse as an integer,
     # e.g. 'db09' -> 9.
     version_digits = xmlcompatibility.lstrip('db')
     self._xmlvalid = int(version_digits)
     self._publ_list = []
        """
        Returns xml in lxml.etree.ElementTree object.
        """
        self._make_xml()
        return self.xmldocument


# ------------------------------------------------------------------------------
# end of class HTMLSequenceWrapper
# ------------------------------------------------------------------------------

if __name__ == '__main__':
    from rrslib.web.crawler import Crawler
    sp = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

    c = Crawler()
    #urls = ['http://www.ualberta.ca/~bhan/publ.htm']
    #urls = ['http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1']
    urls = [
        'http://kaminari.scitec.kobe-u.ac.jp/pub_en.html',
        'http://www.cis.uab.edu/sprague/',
        'http://www2.lifl.fr/~carle/old/mabib.htm',
        'http://www.poli.usp.br/p/fabio.cozman/',
        'http://www.cs.washington.edu/homes/weld/pubs.html',
        'http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1'
    ]
    #urls = ['https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=2&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=3&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=4&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=5&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=6&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=7&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=8&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=9&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=10&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=11&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=12&lang=1']

    #urls = ['http://www.cs.princeton.edu/~schapire/publist.html']
    #urls = ['http://bionum.cs.purdue.edu/p.html']
    #urls = ['http://www.awissenet.eu/publications.aspx']