import os
import sys

from bs4 import BeautifulSoup, element

import trunk

# `domain`, `page_pattern` and `href_pattern` are module-level settings
# defined alongside the crawler (see the pattern definitions below).


def crawl_subject(short_url, only_torrent=False, logfile=sys.stdout):
    url = "%s%s" % (domain, short_url)
    content = trunk.open_page(url)
    soup = BeautifulSoup(content)
    # Locate the one bold span that holds the thread title.
    sps = soup('span', class_='bold', text=page_pattern)
    if len(sps) != 1:
        logfile.write("Error: can't find the title!\n")
        return False, "Can't find the title"
    mc = sps[0].find_next_siblings('div')
    if len(mc) != 1:
        logfile.write("Error: There's more than one div!\n")
        return False, "More than one div"
    if only_torrent:
        logfile.write(mc[0].string.encode('gbk'))
        for dpage in soup('a', href=href_pattern):
            download_torrent(dpage['href'], logfile)
        return True, sps[0].string.encode('gbk')
    # Walk the post body: dump text, fetch inline images, and save each
    # torrent into its own numbered sub-directory.
    dir_seq = 1
    os.mkdir(str(dir_seq))
    os.chdir(str(dir_seq))
    for child in mc[0].descendants:
        if isinstance(child, element.NavigableString):
            logfile.write(child.encode('gbk'))
        elif isinstance(child, element.Tag):
            if child.name == 'br':
                logfile.write('\n')
            elif child.name == 'img':
                trunk.download(child['src'], logfile=logfile)
            elif child.name == 'a' and href_pattern.search(child['href']):
                fn = download_torrent(child['href'], logfile)
                logfile.write('Write the file %s\n' % fn)
                # Move on to a fresh directory for the next torrent.
                dir_seq += 1
                os.chdir('..')
                os.mkdir(str(dir_seq))
                os.chdir(str(dir_seq))
        else:
            logfile.write('child type error!!!')
    # The last directory was created speculatively and is still empty.
    os.chdir('..')
    os.rmdir(str(dir_seq))
    return True, sps[0].string.encode('gbk')
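# Hedged usage sketch (not in the original script): crawl a single thread in
# torrent-only mode, logging to a file. The tid and the log-file name are
# made-up examples.
def _demo_crawl_one():
    log = open('crawl.log', 'w')
    ok, title = crawl_subject('viewthread.php?tid=12345&extra=page%3D1',
                              only_torrent=True, logfile=log)
    log.close()
    print ok, title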
import urllib
import urlparse


def download_torrent(url, logfile=sys.stderr):
    content = trunk.open_page(url)
    soup = BeautifulSoup(content)
    form = soup.find('form')
    # durl = posixpath.normpath(posixpath.join(posixpath.dirname(url), form['action']))
    durl = urlparse.urljoin(url, form['action'])
    # Replay the form's hidden fields as the POST body of the download request.
    datas = form('input', {'type': 'hidden'})
    data = {}
    for item in datas:
        data[item['name']] = item['value'].encode('utf8')
    postdata = urllib.urlencode(data)
    print postdata, len(postdata)
    hd = trunk.header.copy()
    hd.update({
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': str(len(postdata)),
        'Referer': str(url),
    })
    return trunk.download(durl, postdata, hd, logfile=logfile)
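# `trunk` is a helper module not shown here; from its call sites it must
# expose `header` (default HTTP headers), `open_page(url)` returning the page
# body, and `download(url[, postdata[, headers]], logfile=...)` returning a
# filename or the sentinel 'existed'. A minimal sketch under those
# assumptions follows; it is illustrative only (it would live in trunk.py)
# and is not the original implementation.
import posixpath
import urllib2


def open_page(url):
    req = urllib2.Request(url, headers=trunk.header)
    return urllib2.urlopen(req).read()


def download(url, postdata=None, headers=None, logfile=sys.stderr):
    # Derive a local file name from the URL path.
    fn = posixpath.basename(urlparse.urlparse(url).path) or 'index'
    if os.path.exists(fn):
        return 'existed'  # crawl_subject checks for this sentinel
    resp = urllib2.urlopen(urllib2.Request(url, postdata,
                                           headers or trunk.header))
    with open(fn, 'wb') as f:
        f.write(resp.read())
    logfile.write('downloaded %s\n' % fn)
    return fn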
import re
import time

pathquery = 'forumdisplay.php?fid=21&page='
header = trunk.header
fid_id = {'weimei': 13, 'zipai': 9, 'oumei': 10}  # forum ids for the picture boards
pathquery = pathquery.replace('21', str(fid_id['oumei']))
#page_pattern = re.compile(ur'\[\d{2}-\d{2}\]')
href_pattern = re.compile(ur'viewthread\.php\?tid=\d+.*extra=page%3D1$')


def crawl_subject(short_url, with_jpg=True, logfile=sys.stdout):
    # Picture-board variant: fetch every clickable image in the thread.
    # `with_jpg` is currently unused.
    url = "%s%s" % (domain, short_url)
    content = trunk.open_page(url)
    soup = BeautifulSoup(content)
    for img in soup('img', onclick=True):
        # Stop once we hit an image that is already on disk.
        if trunk.download(img['src'], logfile=logfile) == 'existed':
            break


def crawl_content(content, clf=sys.stdout):
    soup = BeautifulSoup(content)
    # The listing shows newest threads first; crawl them oldest-first.
    for a in reversed(soup('a', href=href_pattern, title=None, style=None)):
        # clf.write('%s\n' % a.encode('gbk'))
        print a.encode('gbk')
        # Give each thread its own timestamped working directory.
        now = str(time.time())
        os.mkdir(now)
        os.chdir(now)
        crawl_subject(a['href'], logfile=clf)
        os.chdir('..')


def crawl_page(page_id=1, clf=sys.stdout):
    content = trunk.open_page('%s%s%d' % (domain, pathquery, page_id))
    crawl_content(content, clf)
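# Hedged driver sketch (assumption, not in the original source): walk the
# first few listing pages in order, with one log file per page. The page
# count and log-file names are illustrative.
def crawl_pages(n_pages=3):
    for pid in range(1, n_pages + 1):
        clf = open('page_%d.log' % pid, 'w')
        crawl_page(pid, clf)
        clf.close()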