def spider_crawl():
    """Interactively crawl gtainside topic pages selected by the user.

    Runs the homepage spider, lists the topics found in the module-level
    ``_info`` table (presumably filled by the SpiderHomePage calls), then
    repeatedly prompts for ``[id], [startpage], [endpage]`` and crawls the
    chosen topic.  Entering ``finish`` ends the session.

    An end page of 0 means "crawl to the last page"; otherwise the start
    page must be smaller than the end page.
    """
    global _info
    global _info2

    print('Crawling at gtainside homepage...')
    spider_homepage = SpiderHomePage()
    spider_homepage.narrow_collect_range()
    spider_homepage.set_type_info()
    topics = _info.keys()

    while True:
        print('Crawling at gtainside topicpage:')
        print(' There have:')
        for topic in topics:
            print(' id:%3d - (%-3s/%-12s)' % (
                int(_info[topic]['id']),
                _info[topic]['ver'],
                _info[topic]['type']))
        print('Please input: [id], [startpage], [endpage] to start crawl.')
        print('Note1: Either page set to 0 will crawl to last page.')
        print('Note2: Input \'finish\' will finish the crawl')
        r = raw_input('->')

        if r.startswith('finish'):
            print('Collect action at gtainside finished.')
            pause()
            break

        # Expect exactly three comma-separated, non-negative integers.
        fields = [field.strip() for field in r.split(',')]
        valid = len(fields) == 3 and all(field.isdigit() for field in fields)
        if valid:
            topic_id = fields[0]
            # Compare pages as integers: as strings '10' sorts before '9'.
            start, end = int(fields[1]), int(fields[2])
            # isdigit() already rules out negative input, so 0 is the only
            # "to last page" sentinel that can reach this point.
            valid = end == 0 or start < end
        if not valid:
            print('Please input with specific format: [id], [start], [end]')
            print('Note: Either page set to 0 will crawl to last page.')
            pause()
            continue

        # _info2 is looked up with the id exactly as typed (a string).
        if topic_id not in _info2:
            print('[Input error] ID: %s do not find.' % topic_id)
            pause()
            continue

        spider = SpiderTopicPage(_info2[topic_id], end)
        spider.set_maximum_depth()
        # NOTE(review): get_info(self, cur_depth) needs the page to start
        # from; the start page is passed here - confirm the page indexing.
        spider.get_info(start)
def spider_crawl():
    """Interactively crawl a range of gtabbs board pages and dump the result.

    Prompts for ``[min], [max]`` until two integers with ``min < max`` are
    given, then visits board pages ``min`` .. ``max - 1``, records one
    ModInfo entry per topic found on them, and dumps everything through
    ``modinfo.dump`` into a timestamped ``gtabbs_*.pkl`` file.
    """
    while True:
        print('Crawling at gtabbs')
        print('Please input crawling range/pages(Recommendation: 1 - 300)')
        r = raw_input('Please input: [min], [max] - to ensure range/pages\n->')

        # Expect exactly two comma-separated, non-negative integers.
        fields = [field.strip() for field in r.split(',')]
        valid = len(fields) == 2 and all(field.isdigit() for field in fields)
        if valid:
            # Compare as integers: as strings '10' sorts before '9'.
            first, last = int(fields[0]), int(fields[1])
            valid = first < last
        if not valid:
            print('Please input with specific format: [min], [max]')
            pause()
            continue

        for page in range(first, last):
            link_page = 'http://www.gtabbs.com/bbs-141-%d' % page
            for link_topic in SpiderTopicPage(link_page).get_topics():
                spider = SpiderTopicContent(link_topic)
                # No publish date is scraped here; the collection time is
                # stored under both 'date' and 'collecttime'.
                collected_at = strftime('%Y%m%d%H%M%S')
                mod = modinfo.ModInfo(link_topic)
                mod.updatekey('site', 'http://www.gtabbs.com')
                mod.updatekey('link', link_topic)
                mod.updatekey('has_att', spider.detect_attachment())
                mod.updatekey('name', spider.get_name())
                mod.updatekey('type', '')
                mod.updatekey('subtype', '')
                mod.updatekey('ver', spider.get_gtaver())
                mod.updatekey('imglink', spider.get_img())
                mod.updatekey('publisher', spider.get_publisher())
                mod.updatekey('date', collected_at)
                mod.updatekey('collecttime', collected_at)
                print('Collected: %s' % link_topic)

        filename = 'gtabbs_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        # Reset the collected entries after dumping, as the gtagarage and
        # gtainside crawlers do, so a later crawl starts from a clean state.
        modinfo.clear()
        print('Collect action at gtabbs finished.')
        print('Data store at file: %s' % filename)
        pause()
        break
def spider_crawl():
    """Interactively crawl a range of gtagarage mod ids and dump the result.

    Prompts for ``[min], [max]`` until two integers with ``min < max`` are
    given, then visits ``show.php?id=N`` for every N in ``min`` ..
    ``max - 1``, records one ModInfo entry per page, dumps everything
    through ``modinfo.dump`` into a timestamped ``gtagarage_*.pkl`` file
    and clears the collected entries.
    """
    while True:
        print('Crawling at gtagarage')
        print('Please input crawling range/index(Recommendation: 0 - 20000)')
        r = raw_input('Please input: [min], [max] - to ensure range/index\n->')

        # Expect exactly two comma-separated, non-negative integers.
        fields = [field.strip() for field in r.split(',')]
        valid = len(fields) == 2 and all(field.isdigit() for field in fields)
        if valid:
            # Compare as integers: as strings '10' sorts before '9'.
            first, last = int(fields[0]), int(fields[1])
            valid = first < last
        if not valid:
            print('Please input with specific format: [min], [max]')
            pause()
            continue

        for mod_id in range(first, last):
            link = 'http://www.gtagarage.com/mods/show.php?id=%d' % mod_id
            spider = SpiderLinkPage(link)
            mod = modinfo.ModInfo(link)
            mod.updatekey('site', 'http://www.gtagarage.com')
            mod.updatekey('link', link)
            mod.updatekey('authorlink', spider.get_mod_authorlink())
            mod.updatekey('dldlink', spider.get_mod_dldlink())
            mod.updatekey('imglink', spider.get_mod_imglink())
            mod.updatekey('name', spider.get_mod_name())
            mod.updatekey('type', spider.get_mod_type())
            mod.updatekey('subtype', spider.get_mod_subtype())
            mod.updatekey('ver', spider.get_mod_gtaver())
            mod.updatekey('author', spider.get_mod_author())
            mod.updatekey('status', spider.get_mod_status())
            mod.updatekey('date', spider.get_mod_lastupdated())
            mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
            print('Collected: %s' % link)

        filename = 'gtagarage_%s.pkl' % strftime('%Y%m%d%H%M%S')
        modinfo.dump(filename)
        modinfo.clear()
        print('Collect action at gtagarage finished.')
        print('Data store at file: %s' % filename)
        pause()
        break
def get_info(self, cur_depth):
    """Crawl this topic's listing pages and store every mod found on them.

    Pages are fetched from ``cur_depth`` up to (not including) the
    effective depth, which is ``self.maximum_depth`` when ``self.depth`` is
    0 or not below the maximum, and ``self.depth`` otherwise.  Each fetched
    page is kept in ``self.cont`` and scanned with regular expressions; the
    matches are paired positionally into ModInfo entries.  Afterwards the
    entries are dumped to a timestamped ``gtainside_*.pkl`` file and cleared.

    :param cur_depth: index of the first listing page to fetch.
    """
    if self.depth == 0 or self.depth >= self.maximum_depth:
        fac_depth = self.maximum_depth
    else:
        fac_depth = self.depth

    # Type and version depend only on the topic link, not on the page.
    mod_type = get_type_fromlink(self.link)
    mod_ver = get_ver_fromlink(self.link)

    while cur_depth < fac_depth:
        # The listing offset advances by 7 per page (presumably 7 mods are
        # shown per page - confirm against the site).
        cur_link = "%s&start=%d&orderBy=" % (self.link, cur_depth * 7)
        self.cont = spiderutils.openurlex(cur_link).read()
        cur_depth += 1

        # collect info: every pattern has one group, so findall returns
        # the captured strings in page order.
        names = re.findall(
            r'Title:</B></TD>\s+<TD><B>(.*?)</B></TD>', self.cont)
        authors = re.findall(
            r'Author:</TD>\s+<TD>(.*?)</TD>', self.cont)
        dates = re.findall(
            r'Date:</TD>\s+<TD>(.*?)</TD>', self.cont)
        imglinks = re.findall(
            r'Image:</TD>\s+<TD><img src="(.*?)"><BR><BR></TD>', self.cont)
        # The same id feeds both the info-page link and the download link.
        ids = re.findall(
            r'<BR><center><a href="[\D]+(.*?)"><.*?><B>DOWNLOAD</B>',
            self.cont)

        # store info: the n-th match of each pattern belongs to the n-th
        # mod; zip stops at the shortest list if the page is irregular.
        for index, name, author, date, imglink in zip(
                ids, names, authors, dates, imglinks):
            infopage = "%s%s%d" % (
                self.info["homepage"], self.info["infopage"], int(index))
            dldlink = "%s%s%d" % (
                self.info["homepage"], self.info["dldlink"], int(index))
            img = "%s%s%s" % (
                self.info["homepage"], self.info["imglink"], imglink)

            mod = modinfo.ModInfo(infopage)
            mod.updatekey('site', 'http://www.gtainside.com')
            mod.updatekey('link', infopage)
            mod.updatekey('name', name)
            mod.updatekey('type', mod_type)
            mod.updatekey('subtype', '')
            mod.updatekey('ver', mod_ver)
            mod.updatekey('imglink', img)
            mod.updatekey('dldlink', dldlink)
            mod.updatekey('author', author)
            mod.updatekey('date', date)
            mod.updatekey('collecttime', strftime('%Y%m%d%H%M%S'))
            print('Collected: %s' % infopage)

    # NOTE(review): the dump is assumed to run once, after all pages have
    # been crawled - confirm against the original layout.
    # Build the name once so the reported file is the one actually written.
    filename = 'gtainside_%s.pkl' % strftime('%Y%m%d%H%M%S')
    modinfo.dump(filename)
    modinfo.clear()
    print('Single collect action at gtainside finished.')
    print('Data store at file: %s' % filename)
    pause()