def sort(self, pages):
    """Order page URLs by the page number that follows the shared
    URL prefix; URLs without a number (the base page) sort first."""
    file_type = url2filetype(self.url) or ''
    prefix = self.url[:-len(file_type) - 1]
    # If the URL itself ends in a page marker such as "_2" or "-3",
    # drop the marker so every page shares the same prefix.
    if re.match(r'.*[_\-]\d$', prefix):
        prefix = self.url[:-len(file_type) - 3]
    prefix_len = len(prefix)
    res = {}
    for url in pages:
        num = re.search(r'\d+', url[prefix_len:-len(file_type) - 1])
        res[url] = int(num.group(0)) if num else 0
    return [url for url, _ in sorted(res.items(), key=lambda x: x[1])]
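# Rough usage sketch (hypothetical URLs, assuming self.url is the base
# page): with self.url = 'http://example.com/story.html', the input
#     ['http://example.com/story_10.html',
#      'http://example.com/story_2.html',
#      'http://example.com/story.html']
# comes back base page first (no digit -> key 0), then _2, then _10.
# A plain lexicographic sort would wrongly place _10 before _2.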
def clean_bads(self):
    """Drop interactive widgets, off-site links, and images that are
    too small or not in a common photo format."""
    for node in tags(self.doc, 'form', 'iframe', 'textarea', 'input'):
        if node != self.doc:
            self.drop(node)
    jpgs = 'jpg|jpeg|png|gif|bmp'.split('|')
    for node in tags(self.doc, 'img', 'a'):
        if node.tag == 'img':
            width = to_int(node.get('width'))
            height = to_int(node.get('height'))
            src = node.get('src', '')
            # Drop images that are not absolute http URLs, look like
            # theme assets, are not a known image type, or whose
            # declared dimensions are too small for an article photo.
            if not src.startswith('http://') \
                    or 'themes' in src \
                    or (url2filetype(src) or '').lower() not in jpgs \
                    or (width is not None and height is not None
                        and (width < 200 and height < 160
                             or width < 160 or height < 40)):
                self.drop(node)
        elif node.tag == 'a' and not node.get('href', '').startswith('http://'):
            self.drop(node)
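# Worked examples of the size heuristic (hypothetical declared sizes):
# a 120x300 sidebar graphic is dropped (width < 160), a 600x20 spacer is
# dropped (height < 40), and a 180x150 thumbnail is dropped
# (width < 200 and height < 160), while a 300x200 photo survives the
# size check.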
def get_pages(self):
    """Collect links whose href shares this page's URL prefix --
    candidate pages of a multi-page article."""
    file_type = url2filetype(self.url)
    if not file_type:
        return []
    pages = set([self.url])
    prefix = self.url[:-len(file_type) - 1]
    # Strip a trailing page marker such as "_2" or "-3" from the prefix.
    if re.match(r'.*[_\-]\d$', prefix):
        prefix = self.url[:-len(file_type) - 3]
    prefix_len = len(prefix)
    for node in self.doc.iter('a'):
        href = node.get('href').strip() if node.get('href') else None
        if href and len(href) > prefix_len + 2 \
                and href.startswith(prefix):
            # Normalize away fragments and query strings before deduping.
            href = href.split('#')[0].split('?')[0]
            pages.add(href)
    if len(pages) > 1:
        return list(pages)
    return []
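# Worked example (hypothetical URL): with self.url =
# 'http://example.com/story_2.html', url2filetype yields 'html', so the
# prefix is first 'http://example.com/story_2' and, after the trailing
# '_2' marker is stripped, 'http://example.com/story'. A link such as
# 'http://example.com/story_3.html#part2' then matches the prefix and is
# collected as 'http://example.com/story_3.html'.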
def parse(self):
    """Apply the same prefix heuristic as get_pages above to
    self.article and return the result as {'urls': [...]}."""
    file_type = url2filetype(self.article.url)
    if not file_type:
        return {'urls': []}
    pages = set([self.article.url])
    prefix = self.article.url[:-len(file_type) - 1]
    # Strip a trailing page marker such as "_2" or "-3" from the prefix.
    if re.match(r'.*[_\-]\d$', prefix):
        prefix = self.article.url[:-len(file_type) - 3]
    prefix_len = len(prefix)
    for node in self.article.doc.iter('a'):
        href = node.get('href').strip() if node.get('href') else None
        if href and len(href) > prefix_len + 2 \
                and href.startswith(prefix):
            # Normalize away fragments and query strings before deduping.
            href = href.split('#')[0].split('?')[0]
            pages.add(href)
    pages = list(pages)
    if len(pages) == 1:
        # Only the article's own URL was found -- no pagination.
        pages = []
    return {'urls': pages}
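# How the pieces fit together (hypothetical driver code; the owning
# class names are assumptions, not part of this module):
#
#     parser = PageParser(article)   # hypothetical owner of parse()
#     urls = parser.parse()['urls']  # unordered candidate page URLs
#     ordered = article.sort(urls)   # numeric page order, base page first
#
# sort() turns the unordered set produced by parse()/get_pages() into
# reading order.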