import re
from urlparse import urlsplit

import lxml.html as lh

# GetHTMLPage and MIMEHandler are helper classes defined elsewhere in this
# project; import them from the modules where the project keeps them.


class GetHTMLAndParse:

    def __init__(self):
        self.crawler = GetHTMLPage()
        self.crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ))
        self.mime_handler = MIMEHandler()
        # define wanted/unwanted file types
        self.wanted_mimes = [
            'application/pdf',
            'application/msword',
            'text/rtf',
            'application/postscript',
            'octet/stream',
            'application/vnd.oasis.opendocument.text',
        ]
        self.unwanted_mimes = [
            'application/zip',
            'application/x-tar',
            'application/x-gtar',
        ]

    def ghap(self, url):
        """Get HTML And Parse the page identified by url and remember its tree."""
        self._current_tree = None
        # open URL
        try:
            _res = self.crawler.get_page(url)
            if _res[0] == -1:
                self._current_tree = -1
                return (-1, _res[1])
            else:
                self._current_tree = self.crawler.get_etree()
        except:
            return (-1, 'Downloading page interrupted.')
        # successful return
        return (1, 'OK')

    def is_wanted_mime(self, link):
        """Test whether the MIME type of link is a wanted deliverable document type."""
        res = self.get_content_type(link)
        if not res:
            return False
        return res in self.wanted_mimes

    def is_unwanted_mime(self, link):
        """Test whether the MIME type of link is an explicitly unwanted type."""
        res = self.get_content_type(link)
        if not res:
            return True
        return res in self.unwanted_mimes

    def is_page(self, link):
        """Test whether link points to an HTML page."""
        return self.get_content_type(link) == 'text/html'

    def get_content_type(self, url=None):
        """Return the MIME type of the content at url, or False on failure."""
        if url is None:
            return False
        res = self.mime_handler.start([url])
        if res is None:
            print "Error while detecting the MIME type"
            return False
        else:
            return res[url]

    def compare_domains(self, right, left):
        """Compare the domain names of two URLs."""
        rsplit = urlsplit(right)
        lsplit = urlsplit(left)
        # now we have two tuples of parsed URLs
        if re.match("(wiki\.|www\.)?" + rsplit[1], lsplit[1], re.I):
            return 1
        else:
            return 0

    def get_domain_name(self, url):
        """Simply get the domain name from a URL."""
        try:
            # use the urlsplit function; index 1 is the network location
            return urlsplit(url)[1]
        except:
            return None

    def get_all_links(self, regul=None, base=None):
        """Get, filter and edit anchors and return their URLs.

        If regul is not None, return URLs only from anchors whose text or
        attributes match the regexp regul. If base is not None, make the
        URLs absolute by joining base with the anchor's href attribute.
        """
        # get all anchors
        links = self._current_tree.findall('.//a[@href]')
        final = []
        for link in links:
            # all attributes and text together
            try:
                texts = link.text_content() + " " + " ".join(link.values())
            except:
                return list()
            # make links absolute
            if base is not None:
                link.make_links_absolute(base)
            # search in links
            if regul is not None:
                if regul.search(texts):
                    # regul matches
                    final.append(link.get('href'))  # get URL
            else:
                final.append(link.get('href'))
        return list(set(final))  # my little uniq

    def get_pager_links(self, base=None):
        """Helper method for finding pager links ("1 2 3 ... next")."""
        # get all anchors with an href attribute
        links = self._current_tree.findall('.//a[@href]')
        final = []
        for link in links:
            text = lh.tostring(link, method='text', encoding=unicode)
            if base is not None:
                # make links absolute
                link.make_links_absolute(base)
            # search for the pager pattern
            if re.search('(^ ?[0-9]+ ?$)|(next)', text, re.I):
                final.append(link.get('href'))  # get URL
        return list(set(final))  # my little uniq

    def count_all_headers(self, regul=None):
        """Get, filter and count header titles on one page."""
        # look at only the first 3 levels of headers
        try:
            heads = self._current_tree.findall('//h1')
            heads.extend(self._current_tree.findall('//h2'))
            heads.extend(self._current_tree.findall('//h3'))
        except AssertionError:
            return 0
        final = []
        # search in headers
        if regul is not None:
            for head in heads:
                try:
                    if regul.search(head.text_content()):
                        # regul matches
                        final.append(head.text_content())  # keep the matching header
                except UnicodeDecodeError:
                    return (-1, 'Unicode decode error.')
        else:
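# Minimal usage sketch (kept as a comment, illustrative only): the URL and
# the regular expression below are made-up examples, and the sketch assumes
# GetHTMLPage and MIMEHandler behave exactly as they are used in the class
# above.
#
#   ghap = GetHTMLAndParse()
#   status, message = ghap.ghap('http://www.example.org/project/deliverables')
#   if status == 1:
#       links = ghap.get_all_links(regul=re.compile('deliverable', re.I),
#                                  base='http://www.example.org/')
#       documents = [url for url in links if ghap.is_wanted_mime(url)]
#       pages = [url for url in links if ghap.is_page(url)]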