Esempio n. 1
0
    def __init__(self, verbose=False, debug=False):
        """Build the crawler, the sequence wrapper and the manual-parsing helpers.

        Args:
            verbose: Value stored as the instance's private verbose flag.
            debug: Value stored as the instance's debug flag.
        """
        # Flags are stored exactly as passed in; nothing is validated here.
        # NOTE(review): the attribute spelling ``__verbos`` is kept as-is
        # because code outside this method may read it under that name.
        self.__dbg__ = debug
        self.__verbos = verbose

        # Headers handed to the crawler: a Firefox User-Agent string plus an
        # Accept header listing HTML/XHTML/XML content types.
        request_headers = (
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        )
        self._crawler = Crawler()
        self._crawler.set_headers(request_headers)

        # Sequence wrapper with fixed coefficients and a minimum text length.
        self._wraper = HTMLSequenceWrapper(
            childcoef=5.0,
            headercoef=3.0,
            mintextlen=20,
        )

        # Titles regarded as unwanted, and the (initially empty) record list.
        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        # --- helpers for manual processing ---
        # HTML fetching/parsing agent
        self.agent = GetHTMLAndParse()
        # finds the region in which records are searched for
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        # tags listed as omitted, and the tag tolerance value
        self._omitted_tags = ('br', 'img', 'html', 'body')
        self.tagtol = 1
Esempio n. 2
0
    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        """Prepare the state used to search for a deliverables page.

        Args:
            url: Start URL. Stored as ``base_url`` and ``_current_url`` and
                used as the first entry of the link stack.
            verbose: Verbose flag; must be a ``bool``.
            debug: Debug flag, stored as given.
            addkeyw: Optional extra keyword pattern appended to the built-in
                list of significant words. Ignored when ``None``.

        Raises:
            ValueError: If ``verbose`` is not a ``bool``.
        """
        # Validate up front so a bad flag fails before any helper is built.
        if not isinstance(verbose, bool):
            raise ValueError("Verbose flag has to be boolean.")

        # keywords (regular-expression fragments) used for document page search
        self._sigwords = [
            "d((eliverables?)|[0-9])",
            "documents?",
            "reports?",
            "public(ation)?s?",
            "results?",
            "presentations?",
            "library",
            #"projects?",
            "outcomes?",  # was misspelt "outocomes?", which never matched
            "downloads?",
            "outputs?"
        ]

        if addkeyw is not None:
            self._sigwords.append(addkeyw)

        # Associative array containing links with their flags
        #   { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        #   index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1
        self._link_stack = {url: [0, 0, 0]}

        self.base_url = url  # save base (input) url

        # Open a parsing agent to get needed data from page
        self.agent = GetHTMLAndParse()

        self._current_url = url

        # a constant used to set rank in order of importance of the expression
        # being tested (self._sigwords); computed after addkeyw was appended
        self.rank_const = len(self._sigwords)

        # positions inside each link-stack value list
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1  # unvisited/visited
        self.RANK = 2  # value of rank

        # set verbose flag
        self.__verbose__ = verbose

        # set debug flag
        self.__dbg__ = debug
Esempio n. 3
0
    def __init__(self, options=opt, url=None):
        """Store the options and create the page-search and record helpers.

        Args:
            options: Options object. Its ``verbose``, ``debug`` and ``regexp``
                attributes are always read; ``url`` is read only when the
                ``url`` argument is ``None``. Defaults to ``opt``.
            url: Start URL. When given, it takes precedence over
                ``options.url``.
        """
        # get options
        self.opt = options
        # An explicit url wins; otherwise fall back to the one in the options.
        self.opt_url = url if url is not None else self.opt.url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt.verbose,
                                       debug=self.opt.debug,
                                       addkeyw=self.opt.regexp)

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt.debug)
 def __init__(self, verbose=False, debug=False):
     """Create the parsing helpers, the table-header vocabulary and the debugger.

     Args:
         verbose: Forwarded to ``DeliverableDebugger``.
         debug: Forwarded to ``DeliverableDebugger``.
     """
     # agent for parsing html
     self.htmlHandler = GetHTMLAndParse()
     # finds the region in which records are searched for
     self.regionHandler = GetDeliverableRegion()
     # text formatter (encoding, erasing white chars etc.)
     self.formatter = TextFormatUtils()

     # words accepted in the title (header) of a table
     self.table_sem_words = [
         'deliverable',
         'description',
         'name',
         'date',
         'dissemination',
         'no.',
         'wp',
         'delivery',
         'particip',
         'title',
         'nature',
     ]

     # tags listed as omitted, and the tag tolerance value
     self._omitted_tags = ('br', 'img', 'html', 'body')
     self.tagtol = 1

     # The debugger receives both flags; private copies of the values it
     # exposes are then cached on the instance.
     flag_holder = DeliverableDebugger(verbose=verbose, debug=debug)
     self.debugger = flag_holder
     self.__verbose = flag_holder.verbose
     self.__debug = flag_holder.debug
Esempio n. 5
0
    def __init__(self):
        """Create the helper objects held by this instance."""
        # agent for parsing html, kept as ``self.agent``
        self.agent = GetHTMLAndParse()

        # text formatter, kept as ``self.formatter``
        self.formatter = TextFormatUtils()