def __init__(self, verbose=False, debug=False):
    """
    Constructor. Stores the flags and creates the helper objects
    (crawler, sequence wrapper, parser agent, region handler, formatter).

    @param verbose: value stored in the private verbose flag
    @param debug: value stored in the debug flag
    """
    self.__dbg__ = debug
    self.__verbos = verbose

    # Crawler with fixed request headers.
    self._crawler = Crawler()
    request_headers = (
        ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    )
    self._crawler.set_headers(request_headers)

    self._wraper = HTMLSequenceWrapper(
        childcoef=5.0,
        headercoef=3.0,
        mintextlen=20,
    )
    self._unwanted_titles = ['Download here', 'PDF format']
    self._records = []

    # ------------------------------------------------------------------
    # Manual processing
    # ------------------------------------------------------------------
    self.agent = GetHTMLAndParse()
    # Used to get the region where records are searched for.
    self.regionHandler = GetDeliverableRegion()
    # Text formatter (encoding, erasing white characters etc.).
    self.formatter = TextFormatUtils()
    self._omitted_tags = ('br', 'img', 'html', 'body')
    # Tag tolerance.
    self.tagtol = 1
def __init__(self, headercoef=2.5):
    """
    Constructor. Stores the header coefficient and creates the helper
    objects kept on the instance.

    @param headercoef: lower border of an element's visibility for it to
                       be handled as a header
    """
    self.headercoef = headercoef

    # Helper objects; created in the same order as before.
    self.generalizer = _RRSPropertyGeneralizer()
    self.ee = EntityExtractor()
    self.bibtexparser = BibTeXParser()

    # The crawler gets FileDownloader registered as its handler.
    self.crawler = Crawler()
    self.mime_handler = MIMEhandler()
    self.crawler.set_handler(FileDownloader)
def __init__(self, xmlcompatibility='db09'):
    """
    Constructor. Creates the helper objects and records the requested
    XML compatibility.

    @param xmlcompatibility: compatibility identifier such as 'db09'; it is
                             passed to the citation extractor and its
                             numeric part is kept in self._xmlvalid
    """
    self.seqwrapper = HTMLSequenceWrapper(
        childcoef=7.0,
        headercoef=3.0,
        mintextlen=30,
    )
    self.citaextractor = CitationEntityExtractor(
        ALL,
        xmlcompatibility=xmlcompatibility,
    )
    self.ee = EntityExtractor()
    self.mime = MIMEhandler()
    self.crawler = Crawler()
    self.bibtex = BibTeXParser()

    self.xmlcompatibility = xmlcompatibility
    # lstrip('db') removes every leading 'd'/'b' character (a character
    # set, not a literal prefix), so 'db09' -> '09' -> 9.
    numeric_suffix = xmlcompatibility.lstrip('db')
    self._xmlvalid = int(numeric_suffix)
    self._publ_list = []
""" Returns xml in lxml.etree.ElementTree object. """ self._make_xml() return self.xmldocument # ------------------------------------------------------------------------------ # end of class HTMLSequenceWrapper # ------------------------------------------------------------------------------ if __name__ == '__main__': from rrslib.web.crawler import Crawler sp = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20) c = Crawler() #urls = ['http://www.ualberta.ca/~bhan/publ.htm'] #urls = ['http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1'] urls = [ 'http://kaminari.scitec.kobe-u.ac.jp/pub_en.html', 'http://www.cis.uab.edu/sprague/', 'http://www2.lifl.fr/~carle/old/mabib.htm', 'http://www.poli.usp.br/p/fabio.cozman/', 'http://www.cs.washington.edu/homes/weld/pubs.html', 'http://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=1&lang=1' ] #urls = ['https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=2&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=3&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=4&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=5&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=6&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=7&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=8&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=9&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=10&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=11&lang=1', 'https://www.vutbr.cz/index.php?page=obsah_publikace&wapp=portal&parent=3&tail=3&str=12&lang=1'] 
#urls = ['http://www.cs.princeton.edu/~schapire/publist.html'] #urls = ['http://bionum.cs.purdue.edu/p.html'] #urls = ['http://www.awissenet.eu/publications.aspx']