def magic_fetch_and_insert(self):
    """Fetch the resource-listing page at self.url and insert every row
    into the 'all_resource' table.

    Scrapes an oabt.org table page using a hard-coded browser header set
    (including a captured session cookie), then walks fixed ``.contents``
    indices to pull size, category, name, magnet and ed2k link per row.

    NOTE(review): parsing is entirely position-dependent -- any change to
    the page layout breaks this method.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Connection': 'keep-alive',
        'Cookie': '37cs_user=37cs13650143833; Hm_lvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366; 37cs_show=1%2C26; PHPSESSID=r3okpuu9o47bt9u32epkii08i2; 37cs_pidx=4; Hm_lpvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366',
        'Host': 'oabt.org',
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
    }
    req = urllib2.Request(self.url, headers=headers)
    doc = urllib2.urlopen(req, timeout=10).read()
    page_soup = BeautifulSoup.BeautifulSoup(doc)
    tables = page_soup.findAll('table', cellspacing="0")
    # The 4th matching table holds the listing; skip its header rows.
    tables = tables[3].contents[4:]
    for tr in tables:
        size = tr.contents[1].contents[9].contents[0]  # size
        typeL2 = tr.contents[1].contents[1].contents[0].contents[0]  # type
        typeL1 = 'Video'
        # NOTE(review): mapping u'泰剧' (Thai drama) to 'PC Games'/'Games'
        # looks like a copy-paste slip from the TPB fetcher -- confirm the
        # intended category before relying on it.
        if typeL2 == u'泰剧':
            typeL2 = 'PC Games'
            typeL1 = 'Games'
        resource_name = tr.contents[1].contents[3].contents[1].contents[0]  # name
        magnet = tr.contents[1].contents[5].contents[1]['href']  # magnet
        ed2k = tr.contents[1].contents[5].contents[2]['ed2k']  # ed2k
        try:
            util_db.insert('all_resource', resource_name=resource_name,
                           typeL1=typeL1, typeL2=typeL2, magnet=magnet,
                           size=size, hotrank=hotrank_oabt_weighted,
                           extern_info='False', language='CH', ed2k=ed2k)
        except Exception as inst:
            # Was a bare `except:` that discarded the error entirely; keep
            # the best-effort insert but report what actually went wrong
            # (consistent with the other fetchers in this file).
            print('insert Err')
            print(inst)
    print('OK')
def fetch(url, dbname=alldbname):
    """Fetch resource info from the given url and store it in the database.

    The page must contain direct links. (Translated from the original
    Chinese docstring.)

    Bug fixed: the error counter ``i`` used to be reset to 0 inside the
    loop body, so the "give up after repeated failures" break
    (``if i > 3``) could never fire -- ``i`` never exceeded 1. It is now
    initialized once, before the loop.
    """
    try:
        doc = urllib2.urlopen(url, timeout=10)
    except Exception:
        print('open url Err, url:%s' % (url))
        return
    try:
        soup = BeautifulSoup.BeautifulSoup(doc.read())
        souptrs = BeautifulSoup.BeautifulSoup(str(soup.findAll('tr')))
    except Exception:
        print('BeautifulSoup Err')
        return
    i = 0  # cumulative parse-failure count
    for tr in souptrs.contents[2:]:
        if hasattr(tr, 'name'):  # skip NavigableStrings between <tr> tags
            # Extract resource name, category, link and size.
            try:
                acollect = tr.findAll('a')
                typeL1 = ''.join(acollect[0].contents)
                typeL2 = ''.join(acollect[1].contents)
                # Normalize the category name.
                if typeL2 == 'PC' and typeL1 == 'Games':
                    typeL2 = 'PC Games'
                name = ''.join(acollect[2].contents)
                magnet = acollect[3]['href']
                font = tr.findAll('font')
                sizelazy = ''.join(font[0].contents[0])
                # Slice "<number> <unit>iB" between 'Size' and 'iB';
                # strongly format-dependent (per the original comment).
                size = sizelazy[sizelazy.find('Size') + 5:sizelazy.find('iB') + 2].replace(u' ', '')
                # Weighted hotrank; 'top' pages get an extra bonus.
                hotrank = hotrank_tpb_weighted
                if url.find('top') > 0:
                    hotrank += hotrank_top_weighted
                print("name:%s, typeL1:%s, typeL2:%s, size:%s" % (name, typeL1, typeL2, size))
                util_db.insert('all_resource', resource_name=name,
                               typeL1=typeL1, typeL2=typeL2, magnet=magnet,
                               size=size, hotrank=hotrank,
                               extern_info='False', language='EN', ed2k='')
            except Exception:
                i = i + 1
                print('fetch resouce url Err, url:%s' % (url))
                if i > 3:
                    # Too many consecutive row failures -- give up on page.
                    break
def fetch(url, dbname=alldbname):
    """Fetch resource info from the given url and store it in the database.

    The page must contain direct links. (Translated from the original
    Chinese docstring.)

    Only the first 30 elements of the parsed <tr> list are examined
    (NOTE: the cap counts whitespace NavigableStrings too, not just rows).

    Cleanup: removed leftover commented-out debug code and the
    Python-2-only ``ur''`` prefix (raw has no effect on a space literal).
    """
    try:
        doc = urllib2.urlopen(url, timeout=10)
    except Exception:
        print('open url Err, url:%s' % (url))
        return
    try:
        soup = BeautifulSoup.BeautifulSoup(doc.read())
        souptrs = BeautifulSoup.BeautifulSoup(str(soup.findAll('tr')))
    except Exception:
        print('BeautifulSoup Err')
        return
    i = 0  # number of elements examined so far
    for tr in souptrs.contents[2:]:
        i = i + 1
        if i > 30:
            # Hard cap on how much of the page we process.
            break
        if hasattr(tr, 'name'):  # skip NavigableStrings between <tr> tags
            # Extract resource name, category, link and size.
            try:
                acollect = tr.findAll('a')
                typeL1 = ''.join(acollect[0].contents)
                typeL2 = ''.join(acollect[1].contents)
                # Normalize the category name.
                if typeL2 == 'PC' and typeL1 == 'Games':
                    typeL2 = 'PC Games'
                name = ''.join(acollect[2].contents)
                magnet = acollect[3]['href']
                font = tr.findAll('font')
                sizelazy = ''.join(font[0].contents[0])
                # Slice "<number> <unit>iB" between 'Size' and 'iB';
                # strongly format-dependent (per the original comment).
                size = sizelazy[sizelazy.find('Size') + 5:sizelazy.find('iB') + 2].replace(u' ', '')
                # Weighted hotrank; 'top' pages get an extra bonus.
                hotrank = hotrank_tpb_weighted
                if url.find('top') > 0:
                    hotrank += hotrank_top_weighted
                print("name:%s, typeL1:%s, typeL2:%s, size:%s" % (name, typeL1, typeL2, size))
                util_db.insert('all_resource', resource_name=name,
                               typeL1=typeL1, typeL2=typeL2, magnet=magnet,
                               size=size, hotrank=hotrank,
                               extern_info='False', language='EN', ed2k='')
            except Exception as inst:
                print('fetch resouce url Err, url:%s' % (url))
                print(inst)
def magic_fetch_and_insert(self):
    """Fetch the resource-listing page at self.url and insert every row
    into the 'all_resource' table.

    Scrapes an oabt.org table page using a hard-coded browser header set
    (including a captured session cookie), then walks fixed ``.contents``
    indices to pull size, category, name, magnet and ed2k link per row.

    NOTE(review): parsing is entirely position-dependent. This duplicate
    of the earlier version reads the name from ``contents[3].contents[0]``
    where the other copy uses ``contents[3].contents[1]`` -- confirm which
    index matches the live page.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-cn,zh;q=0.5",
        "Connection": "keep-alive",
        "Cookie": "37cs_user=37cs13650143833; Hm_lvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366; 37cs_show=1%2C26; PHPSESSID=r3okpuu9o47bt9u32epkii08i2; 37cs_pidx=4; Hm_lpvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366",
        "Host": "oabt.org",
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2",
    }
    req = urllib2.Request(self.url, headers=headers)
    doc = urllib2.urlopen(req, timeout=10).read()
    page_soup = BeautifulSoup.BeautifulSoup(doc)
    tables = page_soup.findAll("table", cellspacing="0")
    # The 4th matching table holds the listing; skip its header rows.
    tables = tables[3].contents[4:]
    for tr in tables:
        size = tr.contents[1].contents[9].contents[0]  # size
        typeL2 = tr.contents[1].contents[1].contents[0].contents[0]  # type
        typeL1 = "Video"
        # NOTE(review): mapping u'泰剧' (Thai drama) to 'PC Games'/'Games'
        # looks like a copy-paste slip from the TPB fetcher -- confirm the
        # intended category before relying on it.
        if typeL2 == u"泰剧":
            typeL2 = "PC Games"
            typeL1 = "Games"
        resource_name = tr.contents[1].contents[3].contents[0].contents[0]  # name
        magnet = tr.contents[1].contents[5].contents[1]["href"]  # magnet
        ed2k = tr.contents[1].contents[5].contents[2]["ed2k"]  # ed2k
        try:
            util_db.insert(
                "all_resource",
                resource_name=resource_name,
                typeL1=typeL1,
                typeL2=typeL2,
                magnet=magnet,
                size=size,
                hotrank=hotrank_oabt_weighted,
                extern_info="False",
                language="CH",
                ed2k=ed2k,
            )
        except Exception as inst:
            # Was a bare `except:` that discarded the error entirely; keep
            # the best-effort insert but report what actually went wrong
            # (consistent with the other fetchers in this file).
            print("insert Err")
            print(inst)
    print("OK")