Python openurlex Examples, spiderutils.openurlex Python Examples

Example #1

0

Show file

File: spider_gtainside.py Project: itsyzz/gtamod_spider

 def __init__(self, link, depth=0):
     """Remember the listing link and depth, then fetch the page.

     Args:
         link: URL handed to ``spiderutils.openurlex``.
         depth: stored unchanged on ``self.depth``; defaults to 0.
     """
     self.link = link
     self.depth = depth
     # URL fragments stored for later use; "homepage" holds the site
     # base URL, the other entries hold relative paths.
     self.info = dict(
         homepage="http://www.gtainside.com/en",
         infopage="/download.php?do=comments&id=",
         dldlink="/download.php?do=download&id=",
         imglink="/")
     response = spiderutils.openurlex(link)
     # Whatever read() returns is kept as the page content.
     self.cont = response.read()

Example #2

0

Show file

File: spider_gtabbs.py Project: itsyzz/gtamod_spider

 def __init__(self, link):
     """Fetch a topic page and cut out its main content section.

     The fetched body is decoded as UTF-8 and re-encoded as GB2312
     (unencodable characters are replaced) before being stored on
     ``self.content``.

     Args:
         link: URL handed to ``spiderutils.openurlex``.
     """
     self.link = link
     raw_page = spiderutils.openurlex(link).read()
     # default encode at GTABBS.com is utf-8, re-encode in gb2312
     self.content = raw_page.decode('utf-8').encode('gb2312', 'replace')

     self.kwd = {"attachment": 'href="job.php?action=download&aid'}

     # The main topic runs from the opening "readContent" div up to and
     # including the start of the "mark_tpc" div.
     section_start = re.search(r'<div class="readContent">', self.content)
     section_end = re.search(r'<div id="mark_tpc"', self.content)
     # NOTE(review): when either marker is missing, main_topic_content
     # is never assigned -- confirm that later code tolerates this.
     if not (section_start is None or section_end is None):
         begin = section_start.start()
         finish = section_end.end()
         self.main_topic_content = self.content[begin:finish]

Example #3

0

Show file

File: spider_gtainside.py Project: itsyzz/gtamod_spider

 def __init__(self):
     """Fetch the gtainside.com front page and set up lookup tables."""
     homepage = "http://www.gtainside.com"
     self.cont = spiderutils.openurlex(homepage).read()
     self.info = dict(
         homepage=homepage,
         type_link="/en/download.php?do=cat&main_cat=")
     # (long name, short code) pairs; self.ver maps in both directions.
     name_code_pairs = (
         ("GTA IV", "IV"),
         ("GTA:SanAndreas", "SA"),
         ("GTA:ViceCity", "VC"),
         ("GTA III", "III"),
         ("GTA:LCS", "LCS"),
         ("GTA:VCS", "VCS"),
     )
     self.ver = dict(name_code_pairs)
     self.ver.update((code, name) for name, code in name_code_pairs)

Example #4

0

Show file

File: (bug2)spider_gtainside.py Project: itsyzz/gtamod_spider

 def __init__(self, link):
     """Fetch the page at ``link`` and set up lookup tables.

     Args:
         link: URL handed to ``spiderutils.openurlex``.
     """
     self.link = link
     page = spiderutils.openurlex(link)
     self.cont = page.read()
     self.info = dict(
         homepage="http://www.gtainside.com",
         subtypelink="/download.php?do=cat&id=")
     # (long name, short code) pairs; self.ver maps in both directions.
     name_code_pairs = (
         ("GTA IV", "IV"),
         ("GTA:SanAndreas", "SA"),
         ("GTA:ViceCity", "VC"),
         ("GTA III", "III"),
         ("GTA:LCS", "LCS"),
         ("GTA:VCS", "VCS"),
     )
     self.ver = dict(name_code_pairs)
     self.ver.update((code, name) for name, code in name_code_pairs)

Example #5

0

Show file

File: spider_gtagarage.py Project: itsyzz/gtamod_spider

 def __init__(self, link):
     """Record the mod page link and fetch the page.

     Args:
         link: URL handed to ``spiderutils.openurlex``; also stored
             under the "link" key of ``self.data``.
     """
     self.data = dict(link=link)
     response = spiderutils.openurlex(link)
     self.content = response.read()
     # URL fragments / templates stored for later use; "imglink"
     # contains a %s placeholder.
     self.kwd = dict(
         authorlink="www.gtagarage.com/users/profile.php?M=",
         dldlink="www.gtagarage.com/mods/download.php?f=",
         imglink="http://media.gtanet.com/gtagarage/files/image_%s.jpg")

Example #6

0

Show file

File: spider_gtainside.py Project: itsyzz/gtamod_spider

    def get_info(self, cur_depth):
        """Collect mod entries from the listing pages and dump them.

        Listing pages are fetched from ``self.link`` starting at page
        ``cur_depth`` and stopping before the effective depth, which is
        ``self.maximum_depth`` when ``self.depth`` is 0 or is not below
        the maximum, and ``self.depth`` otherwise.  For each entry found a
        ``modinfo.ModInfo`` is created from its info-page URL and filled
        in.  Afterwards the collected data is handed to ``modinfo.dump``
        under a timestamped ``gtainside_*.pkl`` name, ``modinfo.clear()``
        is called and the method ends in ``pause()``.

        Args:
            cur_depth: index of the first listing page to fetch.

        Raises:
            ValueError: if a captured download id is not an integer.
        """
        fac_depth = (
            self.maximum_depth
            if (self.depth >= self.maximum_depth or self.depth == 0)
            else self.depth)
        while cur_depth < fac_depth:
            # NOTE(review): the step of 7 is presumably the number of
            # entries per listing page -- confirm against the site.
            cur_link = "%s&start=%d&orderBy=" % (self.link, cur_depth * 7)
            self.cont = spiderutils.openurlex(cur_link).read()
            cur_depth += 1

            # collect info (each pattern has a single capture group, so
            # findall returns the captured strings directly)
            names = re.findall(
                r'Title:</B></TD>\s+<TD><B>(.*?)</B></TD>', self.cont)
            authors = re.findall(
                r'Author:</TD>\s+<TD>(.*?)</TD>', self.cont)
            dates = re.findall(
                r'Date:</TD>\s+<TD>(.*?)</TD>', self.cont)
            imgs = re.findall(
                r'Image:</TD>\s+<TD><img src="(.*?)"><BR><BR></TD>', self.cont)
            # The same id feeds both the info-page and the download link.
            ids = re.findall(
                r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>',
                self.cont)

            # store info; zip keeps the fields of one entry together and
            # stops at the shortest list instead of failing mid-record.
            for mod_id, name, author, date, img in zip(
                    ids, names, authors, dates, imgs):
                index = int(mod_id)
                infopage = "%s%s%d" % (
                    self.info["homepage"], self.info["infopage"], index)
                dldlink = "%s%s%d" % (
                    self.info["homepage"], self.info["dldlink"], index)
                imglink = "%s%s%s" % (
                    self.info["homepage"], self.info["imglink"], img)

                mod = modinfo.ModInfo(infopage)
                mod.updatekey('site', 'http://www.gtainside.com')
                mod.updatekey('link', infopage)
                mod.updatekey('name', name)
                mod.updatekey('type', get_type_fromlink(self.link))
                mod.updatekey('subtype', '')
                mod.updatekey('ver', get_ver_fromlink(self.link))
                mod.updatekey('imglink', imglink)
                mod.updatekey('dldlink', dldlink)
                # Previously filled from the name iterator by mistake.
                mod.updatekey('author', author)
                mod.updatekey('date', date)
                mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
                print('Collected: %s' % infopage)

        # Compute the name once so the reported file is the one written.
        filename = 'gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        modinfo.clear()
        print('Single collect action at gtainside finished.')
        print('Data store at file: %s' % filename)
        pause()

Example #7

0

Show file

File: spider_gtabbs.py Project: itsyzz/gtamod_spider

 def __init__(self, link):
     """Fetch the page at ``link`` and keep it re-encoded as GB2312.

     Args:
         link: URL handed to ``spiderutils.openurlex``.
     """
     self.link = link
     raw_page = spiderutils.openurlex(link).read()
     # default encode at GTABBS.com is utf-8, re-encode in gb2312
     # (characters GB2312 cannot represent are replaced)
     self.content = raw_page.decode('utf-8').encode('gb2312', 'replace')
     self.info = dict(link="http://www.gtabbs.com/")