Ejemplo n.º 1
0
 def __init__(self, base_url):
     """Set up empty parse state for a document located at ``base_url``.

     ``base_url`` is handed unchanged to ``Canonicalizer``; what that
     helper does with it is not visible from here.
     """
     # Helper built from the base URL (presumably used to normalise
     # links found in the document -- confirm against Canonicalizer).
     self.canon = Canonicalizer(base_url)

     # State flags, all initially off.
     self.in_body = self.clean_once = self.is_title = False

     # Accumulated results, all initially empty.
     self.text_lines = []
     self.title = ''
     self.links = []
Ejemplo n.º 2
0
 def __init__(self, base_url):
     """Create the initial (empty) state for parsing one document.

     The only argument, ``base_url``, is passed straight through to
     ``Canonicalizer``; its exact use there is not shown in this block.
     """
     self.canon = Canonicalizer(base_url)

     # Boolean flags start cleared.
     self.in_body, self.clean_once, self.is_title = False, False, False

     # Result containers start empty.
     self.title = ''
     self.text_lines, self.links = [], []
Ejemplo n.º 3
0
class ParserTarget(object):
    """Collect the title, body text and anchor links from parser events.

    The object exposes ``start``/``end``/``data``/``comment``/``close``
    callbacks (presumably driven by an lxml/ElementTree-style parser
    target API -- confirm against the caller).  After parsing, results are
    available as ``title``, ``text_lines`` and ``links``.
    """

    # Tags whose opening switches text collection off.
    _SKIPPED_TAGS = ('link', 'script', 'style')

    def __init__(self, base_url):
        """Initialise empty parse state.

        ``base_url`` is passed to ``Canonicalizer``; the resulting object
        is later used to process every collected ``href``.
        """
        self.canon = Canonicalizer(base_url)
        self.in_body = False      # True between <body> and </body>
        self.clean_once = False   # True while text chunks should be kept
        self.is_title = False     # True while waiting for the title text
        self.text_lines = []      # stripped, non-empty text chunks
        self.title = ''           # first non-empty text seen inside <title>
        self.links = []           # results of canon.norms() for each href

    def start(self, tag, attribute):
        """Handle an opening tag.

        ``attribute`` is expected to be a mapping of attribute names to
        values.  Inside the body, the text-collection flag is recomputed
        on every opening tag, and ``<a href=...>`` values longer than
        three characters are passed to ``canon.norms`` and recorded.
        """
        if tag == 'body':
            self.in_body = True
        if self.in_body:
            self.clean_once = tag not in self._SKIPPED_TAGS
        if tag == 'title':
            self.is_title = True
        # ``in`` instead of dict.has_key(): has_key() was removed in
        # Python 3, while ``in`` works on both Python 2 and 3.
        if self.in_body and tag == 'a' and 'href' in attribute:
            href = attribute['href']
            # hrefs of three characters or fewer are ignored.
            if len(href) > 3:
                self.links.append(self.canon.norms(href))

    def end(self, tag):
        """Handle a closing tag."""
        if tag == 'body':
            self.in_body = False
        elif tag == 'title':
            # An empty <title></title> must not leave the flag armed,
            # otherwise the next text chunk would be stored as the title.
            self.is_title = False

    def data(self, data):
        """Handle a text chunk; whitespace-only chunks are ignored."""
        d = data.strip()
        if self.is_title and d:
            # Only the first non-empty chunk after <title> is kept.
            self.title = d
            self.is_title = False
        if self.clean_once and d:
            self.text_lines.append(d)

    def comment(self, text):
        """Ignore comments."""
        pass

    def close(self):
        """Nothing to finalise; returns ``None``."""
        pass
Ejemplo n.º 4
0
class ParserTarget(object):
    """Event-driven collector for a page's title, visible text and links.

    Provides the ``start``/``end``/``data``/``comment``/``close`` callbacks
    (presumably for an lxml/ElementTree-style parser target -- confirm
    against the caller).  Results accumulate in ``title``, ``text_lines``
    and ``links``.
    """

    def __init__(self, base_url):
        """Initialise empty parse state.

        ``base_url`` is passed to ``Canonicalizer``; the resulting object
        is later used to process every collected ``href``.
        """
        self.canon = Canonicalizer(base_url)
        self.in_body = False      # True between <body> and </body>
        self.clean_once = False   # True while text chunks should be kept
        self.is_title = False     # True while waiting for the title text
        self.text_lines = []      # stripped, non-empty text chunks
        self.title = ''           # first non-empty text seen inside <title>
        self.links = []           # results of canon.norms() for each href

    def start(self, tag, attribute):
        """Handle an opening tag.

        ``attribute`` is expected to be a mapping of attribute names to
        values.  Inside the body, every opening tag recomputes whether
        text should be collected (off for link/script/style), and
        ``<a href=...>`` values longer than three characters are passed
        to ``canon.norms`` and recorded.
        """
        if tag == 'body':
            self.in_body = True
        if self.in_body:
            self.clean_once = tag not in ('link', 'script', 'style')
        if tag == 'title':
            self.is_title = True
        # dict.has_key() does not exist on Python 3; the ``in`` operator
        # is equivalent and works on both Python 2 and 3.
        if (self.in_body and tag == 'a' and 'href' in attribute
                and len(attribute['href']) > 3):
            self.links.append(self.canon.norms(attribute['href']))

    def end(self, tag):
        """Handle a closing tag."""
        if tag == 'body':
            self.in_body = False
        if tag == 'title':
            # Disarm the flag so an empty <title></title> cannot cause
            # the next text chunk to be stored as the title.
            self.is_title = False

    def data(self, data):
        """Handle a text chunk; whitespace-only chunks are ignored."""
        d = data.strip()
        if not d:
            return
        if self.is_title:
            # Only the first non-empty chunk after <title> is kept.
            self.title = d
            self.is_title = False
        if self.clean_once:
            self.text_lines.append(d)

    def comment(self, text):
        """Ignore comments."""
        pass

    def close(self):
        """Nothing to finalise; returns ``None``."""
        pass