def __init__(self, link, depth=0):
    """Fetch one GTAInside listing page and record site URL constants.

    link  -- listing-page URL to download.
    depth -- paging depth hint used elsewhere in the crawler (0 = default).
    """
    self.link = link
    self.depth = depth
    # Fetch the page body up front so parsing methods can use self.cont.
    self.cont = spiderutils.openurlex(link).read()
    # URL fragments later combined with ids to build info/download/image links.
    self.info = {
        "homepage": "http://www.gtainside.com/en",
        "infopage": "/download.php?do=comments&id=",
        "dldlink": "/download.php?do=download&id=",
        "imglink": "/",
    }
def __init__(self, link):
    """Fetch a GTABBS topic page and extract the main topic content.

    link -- topic-page URL to download.

    Fix: previously, when either content marker was missing, the `else`
    branch was `pass` and `self.main_topic_content` was never assigned,
    so any later read raised AttributeError. The attribute is now always
    defined (None when the markers are not found).
    """
    self.link = link
    self.content = spiderutils.openurlex(link).read()
    # Default encoding at GTABBS.com is utf-8; re-encode in gb2312.
    self.content = self.content.decode('utf-8').encode('gb2312', 'replace')
    # Substring used to spot attachment download links in the page.
    self.kwd = {"attachment": 'href="job.php?action=download&aid'}
    # Markers delimiting the first post's content block.
    mst = re.search(r'<div class="readContent">', self.content)
    med = re.search(r'<div id="mark_tpc"', self.content)
    if mst is not None and med is not None:
        self.main_topic_content = self.content[mst.start():med.end()]
    else:
        # Always define the attribute so later access cannot raise.
        self.main_topic_content = None
def __init__(self):
    """Fetch the GTAInside front page and set up URL and version tables."""
    self.cont = spiderutils.openurlex("http://www.gtainside.com").read()
    self.info = {
        "homepage": "http://www.gtainside.com",
        "type_link": "/en/download.php?do=cat&main_cat=",
    }
    # Bidirectional lookup between full game titles and short codes;
    # built from pairs so both directions stay in sync.
    self.ver = {}
    for full_name, short_code in (
        ("GTA IV", "IV"),
        ("GTA:SanAndreas", "SA"),
        ("GTA:ViceCity", "VC"),
        ("GTA III", "III"),
        ("GTA:LCS", "LCS"),
        ("GTA:VCS", "VCS"),
    ):
        self.ver[full_name] = short_code
        self.ver[short_code] = full_name
def __init__(self, link):
    """Fetch a GTAInside category page and set up URL and version tables.

    link -- category-page URL to download.
    """
    self.link = link
    self.cont = spiderutils.openurlex(link).read()
    self.info = {
        "homepage": "http://www.gtainside.com",
        "subtypelink": "/download.php?do=cat&id=",
    }
    # Bidirectional lookup between full game titles and short codes;
    # built from pairs so both directions stay in sync.
    self.ver = {}
    for full_name, short_code in (
        ("GTA IV", "IV"),
        ("GTA:SanAndreas", "SA"),
        ("GTA:ViceCity", "VC"),
        ("GTA III", "III"),
        ("GTA:LCS", "LCS"),
        ("GTA:VCS", "VCS"),
    ):
        self.ver[full_name] = short_code
        self.ver[short_code] = full_name
def __init__(self, link):
    """Fetch a GTAGarage mod page and record URL patterns for parsing.

    link -- mod-page URL to download.
    """
    self.data = {"link": link}
    self.content = spiderutils.openurlex(link).read()
    # Key substrings used to locate author / download / image links
    # inside the downloaded page.
    self.kwd = {
        "authorlink": "www.gtagarage.com/users/profile.php?M=",
        "dldlink": "www.gtagarage.com/mods/download.php?f=",
        "imglink": "http://media.gtanet.com/gtagarage/files/image_%s.jpg",
    }
def get_info(self, cur_depth): fac_depth = ( self.maximum_depth if (self.depth >= self.maximum_depth or self.depth == 0) else self.depth) while cur_depth < fac_depth: cur_link = format("%s&start=%d&orderBy=" % (self.link, cur_depth * 7)) self.cont = spiderutils.openurlex(cur_link).read() cur_depth += 1 # collect info name_iter = re.finditer( r'Title:</B></TD>\s+<TD><B>(.*?)</B></TD>', self.cont) author_iter = re.finditer( r'Author:</TD>\s+<TD>(.*?)</TD>', self.cont) date_iter = re.finditer( r'Date:</TD>\s+<TD>(.*?)</TD>', self.cont) img_iter = re.finditer( r'Image:</TD>\s+<TD><img src="(.*?)"><BR><BR></TD>', self.cont) id_iter_forview = re.finditer( r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>' , self.cont) id_iter_fordld = re.finditer( r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>' , self.cont) mod_name = (name.group(1) for name in name_iter) mod_author = (author.group(1) for author in author_iter) mod_date = (date.group(1) for date in date_iter) mod_img = ( ("%s%s%s" % ( self.info["homepage"], self.info["imglink"], imglink)) for imglink in (imglink.group(1) for imglink in img_iter)) mod_infopage = ( ("%s%s%d" % ( self.info["homepage"], self.info["infopage"], int(index)) for index in (index.group(1) for index in id_iter_forview))) mod_dldlink = ( ("%s%s%d" % ( self.info["homepage"], self.info["dldlink"], int(index)) for index in (index.group(1) for index in id_iter_fordld))) # store info for mod_infopage in mod_infopage: mod = modinfo.ModInfo(mod_infopage) mod.updatekey('site', 'http://www.gtainside.com') mod.updatekey('link', mod_infopage) mod.updatekey('name', mod_name.next()) mod.updatekey('type', get_type_fromlink(self.link)) mod.updatekey('subtype', '') mod.updatekey('ver', get_ver_fromlink(self.link)) mod.updatekey('imglink', mod_img.next()) mod.updatekey('dldlink', mod_dldlink.next()) mod.updatekey('author', mod_name.next()) mod.updatekey('date', mod_date.next()) mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S')) print 'Collected: 
%s' % mod_infopage #mod.show() #break #modinfo.show() filename = 'gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S') modinfo.dump('gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S')) modinfo.clear() print 'Single collect action at gtainside finished.' print 'Data store at file:', filename pause()
def __init__(self, link):
    """Fetch a GTABBS page and re-encode it for local processing.

    link -- page URL to download.
    """
    self.link = link
    raw = spiderutils.openurlex(link).read()
    # Default encoding at GTABBS.com is utf-8; re-encode in gb2312.
    self.content = raw.decode('utf-8').encode('gb2312', 'replace')
    self.info = {"link": "http://www.gtabbs.com/"}