def __init__(self, base_url):
    """Initialise the canonicalizer, the empty result containers and flags.

    base_url is passed straight to Canonicalizer; the resulting object is
    stored on self.canon.
    """
    self.canon = Canonicalizer(base_url)

    # Result containers start out empty.
    self.title = ''
    self.links = []
    self.text_lines = []

    # All state flags start switched off.
    self.in_body = self.clean_once = self.is_title = False
class ParserTarget(object):
    """Event-driven collector of a page's title, visible text and links.

    Exposes start/end/data/comment/close callbacks.
    NOTE(review): this looks like the parser-target protocol of lxml /
    xml.etree -- confirm against the code that feeds this object.

    Results are accumulated on the instance:
      title      -- first non-empty text chunk seen inside a <title> element
      text_lines -- stripped text chunks collected while inside <body>
      links      -- hrefs of <a> tags inside <body>, passed through
                    self.canon.norms()
    """

    # Start tags that switch text collection off until the next start tag.
    _NON_TEXT_TAGS = frozenset(('link', 'script', 'style'))

    def __init__(self, base_url):
        """Create an empty target; base_url is handed to Canonicalizer."""
        self.canon = Canonicalizer(base_url)
        self.in_body = False
        self.clean_once = False
        self.is_title = False
        self.text_lines = []
        self.title = ''
        self.links = []

    def start(self, tag, attribute):
        """Handle an opening tag.

        tag is the element name; attribute is a mapping of attribute names
        to values.
        """
        if tag == 'body':
            self.in_body = True
        if self.in_body:
            # Re-evaluated on every start tag inside <body>: text is kept
            # unless the most recently opened tag is link/script/style.
            self.clean_once = tag not in self._NON_TEXT_TAGS
        if tag == 'title':
            self.is_title = True
        if self.in_body and tag == 'a':
            # dict.has_key() no longer exists on Python 3; .get() works on
            # both major versions and also tolerates a None value.
            href = attribute.get('href')
            # Very short hrefs (3 characters or fewer) are ignored.
            if href is not None and len(href) > 3:
                self.links.append(self.canon.norms(href))

    def end(self, tag):
        """Handle a closing tag."""
        if tag == 'body':
            self.in_body = False
        elif tag == 'title':
            # Without this reset an empty <title></title> would leave the
            # flag armed and the next text chunk would become the title.
            self.is_title = False

    def data(self, data):
        """Handle a text chunk; whitespace-only chunks are ignored."""
        d = data.strip()
        if not d:
            return
        if self.is_title:
            # Only the first non-empty chunk of the title is kept.
            self.title = d
            self.is_title = False
        if self.clean_once:
            self.text_lines.append(d)

    def comment(self, text):
        """Ignore comments."""
        pass

    def close(self):
        """Nothing to finalise; results are read from the attributes."""
        pass
class ParserTarget(object):
    """Callback object that gathers title, body text and links from markup.

    Provides start/end/data/comment/close methods.
    NOTE(review): presumably used as an lxml / xml.etree parser target --
    verify against the caller.

    Instance attributes holding the results:
      title      -- first non-empty text chunk found in a <title> element
      text_lines -- stripped text chunks gathered inside <body>
      links      -- <a href> values found inside <body>, each run through
                    self.canon.norms()
    """

    # Opening one of these tags disables text collection until the next
    # start tag is seen.
    _SKIPPED_TAGS = frozenset(('link', 'script', 'style'))

    def __init__(self, base_url):
        """Build an empty target; base_url is given to Canonicalizer."""
        self.canon = Canonicalizer(base_url)
        self.in_body = False
        self.clean_once = False
        self.is_title = False
        self.text_lines = []
        self.title = ''
        self.links = []

    def start(self, tag, attribute):
        """Process an opening tag.

        tag is the element name; attribute maps attribute names to values.
        """
        if tag == 'body':
            self.in_body = True
        if self.in_body:
            # Updated on each start tag within <body>: text is collected
            # unless the latest opened tag is link/script/style.
            self.clean_once = tag not in self._SKIPPED_TAGS
        if tag == 'title':
            self.is_title = True
        if self.in_body and tag == 'a':
            # dict.has_key() was removed in Python 3; .get() is portable
            # and also copes with a None attribute value.
            href = attribute.get('href')
            # hrefs of 3 characters or fewer are skipped.
            if href is not None and len(href) > 3:
                self.links.append(self.canon.norms(href))

    def end(self, tag):
        """Process a closing tag."""
        if tag == 'body':
            self.in_body = False
        elif tag == 'title':
            # Disarm the flag so an empty <title></title> cannot cause
            # later, unrelated text to be stored as the title.
            self.is_title = False

    def data(self, data):
        """Process a text chunk; chunks that strip to empty are dropped."""
        d = data.strip()
        if not d:
            return
        if self.is_title:
            # Keep only the first non-empty title chunk.
            self.title = d
            self.is_title = False
        if self.clean_once:
            self.text_lines.append(d)

    def comment(self, text):
        """Comments are ignored."""
        pass

    def close(self):
        """No finalisation needed; read results from the attributes."""
        pass