def checkSite(site): siteId = site[0] url = site[2] print siteId, url db = SpeedyDb() wapple = Wapple() answers = wapple.scan(url) if answers is None or answers.__len__() == 0: return db.clearFeatures(siteId, MONTHID) try: for app in answers: categories = "" version = answers[app]["version"] for cat in answers[app]["categories"]: categories = cat["name"] + "," print ">> {0:<20} {1: <30} {2: <6}".format(app, categories.strip(","), version) db.saveFeatures(MONTHID, siteId, app, categories.strip(","), version) except: print 'failed: ', answers
def main(argv): monthid = 0 try: opts, args = getopt.getopt(argv, "m:", ['month']) except getopt.GetoptError: print 'mobilecheck.py -m <monthId>' sys.exit(2) for opt, arg in opts: if opt in ('-m', '--month'): monthid = arg print 'MonthId [', monthid , ']' if monthid != 0: db = SpeedyDb(); print 'running mobile check' sites = db.getUncheckedMobileSites(monthid) print 'Processing ', len(sites), ' sites' pool = Pool(processes=8) pool.map(processSite, sites) pool.close() pool.join()
def getMonth():
    """Return what ``SpeedyDb.getMonthByDate`` gives for the date 5 days ahead.

    The lookup date is today plus five days.
    NOTE(review): presumably the offset lets a run near the end of a month
    pick up the following month - confirm.
    """
    target = datetime.date.today() + datetime.timedelta(days=5)

    speedy = SpeedyDb()
    try:
        return speedy.getMonthByDate(target)
    finally:
        # Close the database even when the lookup raises.
        speedy.cleanClose()
def nightlySpider(dayNum, threads): db = SpeedyDb() sites = db.getSpiderSites() start = (dayNum-1)*14 end = dayNum * 14 print '' print '---------------------------------------------------------------------' print 'processing: ', start , 'to', end, ':', threads, 'threads' print '---------------------------------------------------------------------' print '' SpiderSites(sites[start:end], threads)
class Sniffy(object):
    """Probe every known site for 'beta' and 'new' sub-domains that answer 200."""

    def __init__(self):
        self.db = SpeedyDb()

    @staticmethod
    def _hostSuffix(netloc):
        """Return ``netloc`` as a dot-prefixed suffix without a leading 'www'.

        'www.example.com' and 'example.com' both give '.example.com', so a
        sub-domain label can be placed directly in front.  An empty netloc
        gives ''.
        """
        if not netloc:
            return ''
        # str.strip('www') removes *characters*, not a prefix: it also ate the
        # first/last 'w' of hosts such as 'wirral.gov.uk' and left hosts
        # without 'www.' with no separating dot at all.
        if netloc.lower().startswith('www.'):
            return netloc[3:]
        return '.' + netloc

    def findBetas(self):
        """Check the 'beta' and 'new' sub-domain of every site in the database.

        ``site[2]`` is parsed as the site's URL; sites whose URL has no
        network location are skipped.
        """
        for site in self.db.getSites():
            sitebit = self._hostSuffix(urlparse(site[2]).netloc)
            if not sitebit:
                continue
            self.checkSite('beta', sitebit)
            self.checkSite('new', sitebit)
            # 'alpha' probing is currently disabled:
            # self.checkSite('alpha', sitebit)

    def checkSite(self, type, sitebit):
        """Print ``http://<type><sitebit>`` when requesting it returns 200."""
        url = 'http://' + type + sitebit
        if self.getSite(url) == 200:
            # Single-argument form prints the same text as ``print url``.
            print(url)

    def getSite(self, url):
        """Return the response code for ``url``, or 500 on any failure.

        The request uses a 10 second timeout.  Every error (including HTTP
        error responses raised by urllib2) is reported as 500.
        """
        try:
            response = urllib2.urlopen(url, timeout=10)
            try:
                return response.code
            finally:
                response.close()
        except Exception:
            return 500
def runmonth(monthid): # stuff... here = os.path.dirname(__file__) folder = os.path.join(here, "../results/{0}/html".format(monthid)) sitecount = 0; con = lite.connect('speedyplus.db') cur = con.cursor() db = SpeedyDb() sites = db.getSites() for site in sites: siteName = site[1] siteFile = "{0}\\{1}.html".format(folder, siteName) print "{0:<3} {1:<25}".format(site[0], site[1]), if os.path.exists(siteFile): sitecount = sitecount + 1 print "{0:25}".format(os.path.split(siteFile)[1]), fo = open(siteFile, 'r') content = fo.read() trendyness = GetTheTrendy(content) linkcount = linkCounter(content) words = CountTheWords(content) fo.close() sql = trendlySql_insert.format(site[0], monthid, trendyness, linkcount, words) # print sql cur.execute(sql) con.commit() print '{0:<2} {1:<4} {2}'.format(trendyness, linkcount, words), print '.' print '' for i in range(len(trends)): print '{0:<30}: {1}\t{2:.0%}'.format(trends[i], trendcounts[i], percentage(trendcounts[i],sitecount)) for word, count in c.most_common(100): print word, count
def respider(groupsize, threads): db = SpeedyDb() sites = db.getSpiderSitesInError() siteCount = len(sites) nights = int(math.ceil(float(siteCount) / groupsize)) size = int(math.ceil( siteCount / nights)) day = datetime.datetime.today().day - 1 group = (day % nights)+1 start = group * size; end = min((group * size) + size, siteCount) print ' Performing recrawl from sites in error ( currently', siteCount , ')' print ' Day:', day, '. Group:', group, print '. Start:', start, ". End:", end print '' print r'------------------------------------------------------------------' SpiderSites(sites[start:end], threads)
class MobileCheck(object): def __init__(self): self.db = SpeedyDb(); def runCheck(self, monthId): print 'running mobile check' sites = self.db.getUncheckedMobileSites(monthId) print 'Processing ', len(sites), ' sites' #for site in sites: # # siteId = site[0] # siteName = site[1] # siteUrl = site[2] # # print '' # print siteId, siteName, siteUrl, # self.getMobileResult(siteUrl, siteId, monthId) def getMobileResult(self, url, siteId, monthId): try: result = self.getMobileJson(url) if result.__len__() > 10 : data = json.loads(result) usability = data['ruleGroups']['USABILITY'] print usability['pass'], self.db.saveMobileCheck(siteId,monthId, usability['pass']); except Exception, e: print 'get mobile pass error:', url, e
def loaddata(): db = SpeedyDb() sites = db.getSites() for site in sites: name = site[1] print name, spider_ok = True data = getSiteInfo(name) if not (data is None): print 'Loading....', site[0], name, data['pages'], db.saveLinkInfo(site[0], int(data['pages']), int(data['docs']), int(data['broken']), int(data['queued'])) if int(data["pages"]) == 10000 or int(data['links']) == 20000 or int(data['broken']) == 1000 or int(data['queued']) > 0: spider_ok = False db.setSpiderStatus(site[0], spider_ok) # print spider_ok domains = getDomains(site[0], site[2], name, db)
def main(argv):
    """Check every site in the database using ``THREAD_COUNT`` threads.

    ``argv`` is accepted for the usual entry-point signature but not read.
    """
    allSites = SpeedyDb().getSites()
    CheckSites(allSites, THREAD_COUNT)
def __init__(self):
    """Create the SpeedyDb handle this object keeps as ``self.db``."""
    self.db = SpeedyDb()
def main(argv): monthid = 0 single = 0; try: opts, args = getopt.getopt(argv, "lhm:s:", ['month','list', 'single']) except getopt.GetoptError: print 'SpeedyPlus.py -m <monthId> [-s <siteid>]' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'SpeedyPlus.py -m <monthId>' sys.exit() elif opt in ('-m', '--month'): monthid = arg elif opt in ('-l', '--list'): s = SpeedyDb() s.listMonths() sys.exit() elif opt in ('-s', '--single'): single = arg print 'MonthId [', monthid , ']' if monthid != 0: s = SpeedyDb() if single != 0: #process just one site. (ignore valid month thing) ps = PageSpeedy() ps.ProcessSingleSite(single, monthid) wp = wapple.SpeedyWapple() wp.ProcessSingleSite(single, monthid) #ch = checker.AChecker() #ch.ProcessSingleSite(single, monthid) sys.exit() elif s.validMonth(monthid) == 1 : s.backup(monthid) # pagespeed check # ps = PageSpeedy() # ps.runSpeedy(monthid) # wapplizer check wp = SpeedyWapple() wp.process(monthid) # peeky (extra looking) pky = Peeky() pky.goPeek(monthid) pky.close() # screengrabs grab = ScreenGrabby() grab.runGrabby(monthid) # accessilbity check # ch = checker.AChecker() # ch.runChecker(monthid) s.closeMonth(monthid) else: print 'not a valid month'
def main(argv): monthid = 0 single = 0 try: opts, args = getopt.getopt(argv, "lhmn:s:", ["month", "list", "single", "now"]) except getopt.GetoptError: print "SpeedyPlus.py -m <monthId> [-s <siteid>]" sys.exit(2) for opt, arg in opts: if opt == "-h": print "SpeedyPlus.py -m <monthId>" sys.exit() elif opt in ("-m", "--month"): monthid = arg elif opt in ("-l", "--list"): s = SpeedyDb() s.listMonths() sys.exit() elif opt in ("-s", "--single"): single = arg elif opt in ("-n", "--now"): monthid = getMonth() print "MonthId [", monthid, "]" if monthid != 0: s = SpeedyDb() if single != 0: # process just one site. (ignore valid month thing) ps = PageSpeedy() ps.ProcessSingleSite(single, monthid) # wp = wapple.SpeedyWapple() # wp.ProcessSingleSite(single, monthid) # ch = checker.AChecker() # ch.ProcessSingleSite(single, monthid) sys.exit() elif s.validMonth(monthid) == 1: s.backup(monthid) # pagespeed check ps = PageSpeedy() ps.runSpeedy(monthid) # wapplizer check # wp = SpeedyWapple() # wp.process(monthid) # peeky (extra looking) # pky = Peeky() # pky.goPeek(monthid) # pky.close(); # screengrabs # grab = ScreenGrabby() # grab.runGrabby(monthid) # accessilbity check # ch = checker.AChecker() # ch.runChecker(monthid) s.closeMonth(monthid) else: print "not a valid month"