Example #1
    def magic_fetch_and_insert(self):
        # Relies on module-level imports and globals that the snippet does
        # not show: urllib2, BeautifulSoup (the BS3 module), util_db, and
        # hotrank_oabt_weighted.

        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Encoding': 'gzip, deflate',
                   'Accept-Language': 'zh-cn,zh;q=0.5',
                   'Connection': 'keep-alive',
                   'Cookie': '37cs_user=37cs13650143833; Hm_lvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366; 37cs_show=1%2C26; PHPSESSID=r3okpuu9o47bt9u32epkii08i2; 37cs_pidx=4; Hm_lpvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366',
                   'Host': 'oabt.org',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'}
        req = urllib2.Request(self.url, headers=headers)
        doc = urllib2.urlopen(req, timeout=10).read()

        page_soup = BeautifulSoup.BeautifulSoup(doc)
        tables = page_soup.findAll('table', cellspacing="0")
        tables = tables[3].contents[4:]
        for tr in tables:
            size = tr.contents[1].contents[9].contents[0] #size
            typeL2 = tr.contents[1].contents[1].contents[0].contents[0] #type
            typeL1 = 'Video'
            if typeL2 == u'泰剧':  # u'泰剧' means "Thai drama"
                typeL2 = 'PC Games'
                typeL1 = 'Games'
            resource_name = tr.contents[1].contents[3].contents[1].contents[0] #name
            magnet = tr.contents[1].contents[5].contents[1]['href'] #magnet
            ed2k = tr.contents[1].contents[5].contents[2]['ed2k'] #ed2k
            try:
                util_db.insert('all_resource', resource_name=resource_name,
                               typeL1=typeL1, typeL2=typeL2, magnet=magnet,
                               size=size, hotrank=hotrank_oabt_weighted,
                               extern_info='False', language='CH', ed2k=ed2k)
            except Exception:
                print "insert Err"
        print 'OK'
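Example #1 navigates by positional contents[] indexing, which breaks as soon as the markup shifts by one node. For comparison, here is a minimal sketch of the same fetch-and-parse flow on Python 3 with urllib.request and bs4; the library swap, the parser choice, and the assumption that the page still serves its rows in the fourth cellspacing="0" table are mine, not taken from the source:

# Python 3 sketch, assuming `pip install beautifulsoup4`; not the original code.
import urllib.request
from bs4 import BeautifulSoup

def fetch_rows(url, headers):
    req = urllib.request.Request(url, headers=headers)
    doc = urllib.request.urlopen(req, timeout=10).read()
    soup = BeautifulSoup(doc, 'html.parser')
    # Same positional pick as the example: the fourth cellspacing="0" table.
    table = soup.find_all('table', cellspacing='0')[3]
    for tr in table.contents[4:]:
        if not hasattr(tr, 'contents'):  # skip bare whitespace nodes
            continue
        cell = tr.contents[1]
        yield {
            'size': cell.contents[9].contents[0],
            'type': cell.contents[1].contents[0].contents[0],
        }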
Example #2
def fetch(url, dbname=alldbname):
    """
        Fetch resource info from the given url and store it in the database.
        The page must be one that links to the resources directly.
    """
    # Relies on module-level names not shown here: urllib2, BeautifulSoup (BS3),
    # util_db, alldbname, hotrank_tpb_weighted and hotrank_top_weighted.
    try:
        doc = urllib2.urlopen(url, timeout=10)
    except Exception:
        print 'open url Err, url:%s' % (url)
        return
    try:
        soup = BeautifulSoup.BeautifulSoup(doc.read())
        souptrs = BeautifulSoup.BeautifulSoup(str(soup.findAll('tr')))
    except Exception:
        print 'BeautifulSoup Err'
        return

    # Parse-error counter. In the original it was reset inside the loop,
    # so the `i > 3` bail-out below could never fire.
    i = 0
    for tr in souptrs.contents[2:]:
        if hasattr(tr, 'name'):
            # Extract the resource name, category, link address and size
            try:
                acollect = tr.findAll('a')
                typeL1 = ''.join(acollect[0].contents)
                typeL2 = ''.join(acollect[1].contents)
                # Rename the category
                if typeL2 == 'PC' and typeL1 == 'Games':
                    typeL2 = 'PC Games'
                name = ''.join(acollect[2].contents)
                magnet = acollect[3]['href']
                font = tr.findAll('font')
                sizelazy = ''.join(font[0].contents[0])
                # Extract the size; don't bother reading this closely, it
                # depends heavily on the exact page format.
                size = sizelazy[sizelazy.find('Size') + 5:sizelazy.find('iB') +
                                2].replace(ur' ', '')

                # Determine hotrank
                hotrank = hotrank_tpb_weighted
                if url.find('top') > 0:
                    hotrank += hotrank_top_weighted

                print "name:%s, typeL1:%s, typeL2:%s, size:%s" % (name, typeL1,
                                                                  typeL2, size)
                util_db.insert('all_resource',
                               resource_name=name,
                               typeL1=typeL1,
                               typeL2=typeL2,
                               magnet=magnet,
                               size=size,
                               hotrank=hotrank,
                               extern_info='False',
                               language='EN',
                               ed2k='')
            except Exception:
                i = i + 1
                print 'fetch resource url Err, url:%s' % (url)
                if i > 3:
                    break
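The size extraction above slices between the literal markers 'Size' and 'iB' and then strips the spacing, which silently yields garbage when either marker moves. A regex sketch of the same extraction; the exact "Size 699.04 MiB" wording it matches is inferred from what the slice expects, not confirmed by the source:

# Regex variant of the index-arithmetic slice above; the page wording
# ("Size <number> <unit>iB") is an assumption, not verified.
import re

# re.UNICODE so \s also matches a non-breaking space, which such pages
# often use between the number and the unit (assumption).
SIZE_RE = re.compile(r'Size\s+([\d.]+)\s*([KMGT]?iB)', re.UNICODE)

def parse_size(text):
    m = SIZE_RE.search(text)
    # e.g. u'699.04MiB', the same shape the original slice produces
    return m.group(1) + m.group(2) if m else ''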
Example #3
def fetch(url, dbname=alldbname):
    """
        Fetch resource info from the given url and store it in the database.
        The page must be one that links to the resources directly.
    """
    # Relies on module-level names not shown here: urllib2, BeautifulSoup (BS3),
    # util_db, alldbname, hotrank_tpb_weighted and hotrank_top_weighted.
    try:
        doc = urllib2.urlopen(url, timeout=10)
    except Exception:
        print 'open url Err, url:%s' % (url)
        return
    try:
        soup = BeautifulSoup.BeautifulSoup(doc.read())
        souptrs = BeautifulSoup.BeautifulSoup(str(soup.findAll('tr')))
    except Exception:
        print 'BeautifulSoup Err'
        return

    i = 0  # row counter
    for tr in souptrs.contents[2:]:
        i = i + 1
        if i > 30:  # process at most 30 rows
            break

        if hasattr(tr, 'name'):
            # Extract the resource name, category, link address and size
            try:
                acollect = tr.findAll('a')
                typeL1 = ''.join(acollect[0].contents)
                typeL2 = ''.join(acollect[1].contents)
                # Rename the category
                if typeL2 == 'PC' and typeL1 == 'Games':
                    typeL2 = 'PC Games'
                name = ''.join(acollect[2].contents)
                magnet = acollect[3]['href']
                font = tr.findAll('font')
                sizelazy = ''.join(font[0].contents[0])
                # Extract the size; don't bother reading this closely, it
                # depends heavily on the exact page format.
                size = sizelazy[sizelazy.find('Size') + 5:sizelazy.find('iB') + 2].replace(ur' ', '')

                # Determine hotrank
                hotrank = hotrank_tpb_weighted
                if url.find('top') > 0:
                    hotrank += hotrank_top_weighted

                print "name:%s, typeL1:%s, typeL2:%s, size:%s" % (name, typeL1, typeL2, size)
                util_db.insert('all_resource', resource_name=name,
                               typeL1=typeL1, typeL2=typeL2, magnet=magnet,
                               size=size, hotrank=hotrank, extern_info='False',
                               language='EN', ed2k='')
            except Exception as inst:
                print 'fetch resource url Err, url:%s' % (url)
                print inst
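All four examples funnel rows into util_db.insert, a project-internal helper whose definition is not shown. Below is a minimal sqlite3 stand-in with the same calling shape (table name first, one keyword argument per column); it is purely an assumption for running the snippets locally, not the project's actual module:

# Hypothetical stand-in for the unshown util_db module; the column names
# come from the calls above, the sqlite3 backing is an assumption.
import sqlite3

def insert(table, **fields):
    # `table` is trusted caller input here; only column values are parameterized.
    cols = ', '.join(fields)
    marks = ', '.join('?' * len(fields))
    conn = sqlite3.connect('resources.db')
    with conn:  # commits on success, rolls back on error
        conn.execute('INSERT INTO %s (%s) VALUES (%s)' % (table, cols, marks),
                     tuple(fields.values()))
    conn.close()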
Example #4
    def magic_fetch_and_insert(self):
        # Relies on module-level imports and globals that the snippet does
        # not show: urllib2, BeautifulSoup (the BS3 module), util_db, and
        # hotrank_oabt_weighted.

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-cn,zh;q=0.5",
            "Connection": "keep-alive",
            "Cookie": "37cs_user=37cs13650143833; Hm_lvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366; 37cs_show=1%2C26; PHPSESSID=r3okpuu9o47bt9u32epkii08i2; 37cs_pidx=4; Hm_lpvt_ce396a90f02f136fc25a1bfc9138c834=1331695718366",
            "Host": "oabt.org",
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2",
        }
        req = urllib2.Request(self.url, headers=headers)
        doc = urllib2.urlopen(req, timeout=10).read()

        page_soup = BeautifulSoup.BeautifulSoup(doc)
        tables = page_soup.findAll("table", cellspacing="0")
        tables = tables[3].contents[4:]
        for tr in tables:
            size = tr.contents[1].contents[9].contents[0]  # size
            typeL2 = tr.contents[1].contents[1].contents[0].contents[0]  # type
            typeL1 = "Video"
            if typeL2 == u"泰剧":
                typeL2 = "PC Games"
                typeL1 = "Games"
            resource_name = tr.contents[1].contents[3].contents[0].contents[0]  # name
            magnet = tr.contents[1].contents[5].contents[1]["href"]  # magnet
            ed2k = tr.contents[1].contents[5].contents[2]["ed2k"]  # ed2k
            try:
                util_db.insert(
                    "all_resource",
                    resource_name=resource_name,
                    typeL1=typeL1,
                    typeL2=typeL2,
                    magnet=magnet,
                    size=size,
                    hotrank=hotrank_oabt_weighted,
                    extern_info="False",
                    language="CH",
                    ed2k=ed2k,
                )
            except Exception:
                print "insert Err"
        print "OK"