def checkSite(site):
    siteId = site[0]
    url = site[2]
    print siteId, url

    db = SpeedyDb()
    wapple = Wapple()
    answers = wapple.scan(url)

    if answers is None or answers.__len__() == 0:
        return

    db.clearFeatures(siteId, MONTHID)


    try:
        for app in answers:
            categories = ""
            version = answers[app]["version"]
            for cat in answers[app]["categories"]:
                categories = cat["name"] + ","

            print ">> {0:<20} {1: <30} {2: <6}".format(app, categories.strip(","), version)
            db.saveFeatures(MONTHID, siteId, app, categories.strip(","), version)
    except:
        print 'failed: ', answers
def main(argv):
    monthid = 0 

    try:
        opts, args = getopt.getopt(argv, "m:", ['month'])
    except getopt.GetoptError:
        print 'mobilecheck.py -m <monthId>'
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-m', '--month'):
            monthid = arg 

    print 'MonthId [', monthid , ']'

    if monthid != 0:
        db = SpeedyDb();
   
        print 'running mobile check'
        
        sites = db.getUncheckedMobileSites(monthid)
        
        print 'Processing ', len(sites), ' sites'

        pool = Pool(processes=8)
        pool.map(processSite, sites)
        pool.close()
        pool.join()         
def getMonth():
    """Return what SpeedyDb.getMonthByDate() gives for the date five days ahead.

    The lookup date is today plus five days.
    NOTE(review): presumably so that a run late in a month targets the
    upcoming month -- confirm.

    The database handle is closed with cleanClose() even if the lookup raises.
    """
    target = datetime.date.today() + datetime.timedelta(days=5)

    speedy = SpeedyDb()
    try:
        return speedy.getMonthByDate(target)
    finally:
        speedy.cleanClose()
def nightlySpider(dayNum, threads):
    db = SpeedyDb()
    sites = db.getSpiderSites()

    start = (dayNum-1)*14
    end = dayNum * 14

    print ''
    print '---------------------------------------------------------------------'
    print 'processing: ', start , 'to', end, ':', threads, 'threads'
    print '---------------------------------------------------------------------'
    print ''

    SpiderSites(sites[start:end], threads)
# Example #5
# 0
class Sniffy(object):
    """Looks for 'beta' and 'new' sub-domains of the known sites that answer HTTP 200."""

    def __init__(self):
        """Open the SpeedyDb handle used to list the sites."""
        self.db = SpeedyDb()

    def findBetas(self):
        """Probe every site from SpeedyDb.getSites() for beta/new hosts.

        Each site row is indexed positionally; index 2 is parsed as the
        site URL. URLs with no network location are skipped.
        """
        for site in self.db.getSites():
            host = urlparse(site[2]).netloc
            if not host:
                continue

            # str.strip('www') removes any run of 'w' characters from both
            # ends (turning 'wiki.example.org' into 'iki.example.org'), so
            # drop the exact 'www.' prefix instead.
            if host.startswith('www.'):
                host = host[len('www.'):]

            # checkSite() glues prefix and suffix together as-is, so the
            # suffix always carries the separating dot.
            sitebit = '.' + host

            self.checkSite('beta', sitebit)
            self.checkSite('new', sitebit)
            # self.checkSite('alpha', sitebit)

    def checkSite(self, type, sitebit):
        """Print 'http://<type><sitebit>' when that URL answers with 200.

        type    -- sub-domain label to try, e.g. 'beta'
        sitebit -- rest of the host name, including its leading dot
        """
        url = 'http://' + type + sitebit
        if self.getSite(url) == 200:
            print(url)

    def getSite(self, url):
        """Return the HTTP status code for ``url``, or 500 on any failure.

        The request times out after 10 seconds. Every error, including
        HTTP error statuses raised by urlopen, is reported as 500.
        """
        try:
            response = urllib2.urlopen(url, timeout=10)
        except Exception:
            return 500
        try:
            return response.code
        finally:
            # release the connection instead of waiting for garbage collection
            response.close()
# Example #6
# 0
def runmonth(monthid):
	# stuff...	
	here = os.path.dirname(__file__)
	folder = os.path.join(here, "../results/{0}/html".format(monthid))

	sitecount = 0;

	con = lite.connect('speedyplus.db')
	cur = con.cursor()

	db = SpeedyDb()
	sites = db.getSites()
	for site in sites:
		siteName = site[1]
		siteFile = "{0}\\{1}.html".format(folder, siteName)
		print "{0:<3} {1:<25}".format(site[0], site[1]),
		
		if os.path.exists(siteFile):
			sitecount = sitecount + 1 
			print "{0:25}".format(os.path.split(siteFile)[1]),
			fo = open(siteFile, 'r')
			content = fo.read()
			trendyness = GetTheTrendy(content) 
			linkcount = linkCounter(content)
			words = CountTheWords(content)
			fo.close()
			
			sql = trendlySql_insert.format(site[0], monthid, trendyness, linkcount, words)
			
			# print sql 
			cur.execute(sql)
			con.commit()		
			print '{0:<2} {1:<4} {2}'.format(trendyness, linkcount, words),
		
		print '.' 

	print ''
	for i in range(len(trends)):
		print '{0:<30}: {1}\t{2:.0%}'.format(trends[i], trendcounts[i], percentage(trendcounts[i],sitecount))

	for word, count in c.most_common(100):
		print word, count 
def respider(groupsize, threads):
    """Recrawl tonight's share of the sites whose spider run is in error.

    The sites from SpeedyDb.getSpiderSitesInError() are split into
    ``nights`` groups of (nearly) equal size, with at most ``groupsize``
    sites each. The group is chosen from the day of the month, so
    successive nightly runs step through the groups.

    groupsize -- maximum number of sites to recrawl in one night
    threads   -- passed straight through to SpiderSites
    """
    db = SpeedyDb()
    sites = db.getSpiderSitesInError()

    siteCount = len(sites)
    print '    Performing recrawl from sites in error ( currently', siteCount , ')'

    if siteCount == 0:
        # nothing to recrawl; also avoids dividing by zero below
        return

    nights = int(math.ceil(float(siteCount) / groupsize))
    # float division so ceil() really rounds up (int / int truncates first)
    size = int(math.ceil(float(siteCount) / nights))
    day = datetime.datetime.today().day - 1

    # 0-based index for slicing; using the 1-based number here skipped the
    # first group and made the last night's slice empty
    groupIndex = day % nights
    start = groupIndex * size
    end = min(start + size, siteCount)

    print '    Day:', day, '. Group:', groupIndex + 1,
    print '. Start:', start, ". End:", end
    print ''

    print r'------------------------------------------------------------------'

    SpiderSites(sites[start:end], threads)
class MobileCheck(object):

    def __init__(self):
        self.db = SpeedyDb();
        

    def runCheck(self, monthId):
    
        print 'running mobile check'
        
        sites = self.db.getUncheckedMobileSites(monthId)
        
        print 'Processing ', len(sites), ' sites'

       #for site in sites:
       # 
       #     siteId = site[0]
       #     siteName = site[1]
       #     siteUrl = site[2]
       #     
       #     print ''
       #     print siteId, siteName, siteUrl, 
       #     self.getMobileResult(siteUrl, siteId, monthId)

    def getMobileResult(self, url, siteId, monthId):
    
        try:
            result = self.getMobileJson(url)
                        
            if result.__len__() > 10 :
                data = json.loads(result)
               
                usability = data['ruleGroups']['USABILITY']
                print usability['pass'],

                self.db.saveMobileCheck(siteId,monthId, usability['pass']);
        except Exception, e:
            print 'get mobile pass error:', url, e
def loaddata():
    db = SpeedyDb()
    sites = db.getSites()

    for site in sites:
        name = site[1]
        print name,
        spider_ok = True 
        data = getSiteInfo(name)
        if not (data is None):
            print 'Loading....', site[0], name, data['pages'], 

            db.saveLinkInfo(site[0], int(data['pages']), int(data['docs']), int(data['broken']), int(data['queued']))

            if int(data["pages"]) == 10000 or int(data['links']) == 20000 or int(data['broken']) == 1000 or int(data['queued']) > 0:
                spider_ok = False 
            db.setSpiderStatus(site[0], spider_ok)
            # print spider_ok

        domains = getDomains(site[0], site[2], name, db)
# Example #10
# 0
def main(argv):
    """Run CheckSites over every site from SpeedyDb.getSites().

    argv -- accepted for the usual entry-point signature; not used here.
    THREAD_COUNT is passed as the second argument to CheckSites.
    """
    speedy = SpeedyDb()
    allSites = speedy.getSites()

    CheckSites(allSites, THREAD_COUNT)
 def __init__(self):
     """Create the SpeedyDb handle and keep it on the instance as ``self.db``."""
     self.db = SpeedyDb()
def main(argv):
	monthid = 0 
	single = 0;

	try:
		opts, args = getopt.getopt(argv, "lhm:s:", ['month','list', 'single'])
	except getopt.GetoptError:
		print 'SpeedyPlus.py -m <monthId> [-s <siteid>]'
		sys.exit(2)
		
	for opt, arg in opts:
		if opt == '-h':
			print 'SpeedyPlus.py -m <monthId>'
			sys.exit()
		elif opt in ('-m', '--month'):
			monthid = arg 
		elif opt in ('-l', '--list'):
			s = SpeedyDb()
			s.listMonths()
			sys.exit()
		elif opt in ('-s', '--single'):
			single = arg 
			
	
	print 'MonthId [', monthid , ']'
	
	if monthid != 0:

		s = SpeedyDb()

		if single != 0:
			#process just one site. (ignore valid month thing)			
			ps = PageSpeedy()
			ps.ProcessSingleSite(single, monthid)
			
			wp = wapple.SpeedyWapple()
			wp.ProcessSingleSite(single, monthid)
			
			#ch = checker.AChecker()
			#ch.ProcessSingleSite(single, monthid)
			
			sys.exit()			
		elif s.validMonth(monthid) == 1 :
			s.backup(monthid)	
		
		# pagespeed check
		#	ps = PageSpeedy()
		#	ps.runSpeedy(monthid)
			
		# wapplizer check
			wp = SpeedyWapple()
			wp.process(monthid)

		# peeky (extra looking)			
			pky = Peeky()
			pky.goPeek(monthid)
			pky.close()

		# screengrabs
			grab = ScreenGrabby()
			grab.runGrabby(monthid)
			
		# accessilbity check
		#	ch = checker.AChecker()
		#	ch.runChecker(monthid)
			
			s.closeMonth(monthid)
		else:
			print 'not a valid month'
def main(argv):
    monthid = 0
    single = 0

    try:
        opts, args = getopt.getopt(argv, "lhmn:s:", ["month", "list", "single", "now"])

    except getopt.GetoptError:
        print "SpeedyPlus.py -m <monthId> [-s <siteid>]"
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            print "SpeedyPlus.py -m <monthId>"
            sys.exit()

        elif opt in ("-m", "--month"):
            monthid = arg

        elif opt in ("-l", "--list"):
            s = SpeedyDb()
            s.listMonths()
            sys.exit()

        elif opt in ("-s", "--single"):
            single = arg

        elif opt in ("-n", "--now"):
            monthid = getMonth()

    print "MonthId [", monthid, "]"

    if monthid != 0:

        s = SpeedyDb()

        if single != 0:
            # process just one site. (ignore valid month thing)
            ps = PageSpeedy()
            ps.ProcessSingleSite(single, monthid)

            # wp = wapple.SpeedyWapple()
            # wp.ProcessSingleSite(single, monthid)

            # ch = checker.AChecker()
            # ch.ProcessSingleSite(single, monthid)

            sys.exit()
        elif s.validMonth(monthid) == 1:
            s.backup(monthid)

            # pagespeed check
            ps = PageSpeedy()
            ps.runSpeedy(monthid)

            # wapplizer check
            # 	wp = SpeedyWapple()
            # 	wp.process(monthid)

            # peeky (extra looking)
            # 	pky = Peeky()
            # 	pky.goPeek(monthid)
            # 	pky.close();

            # screengrabs
            # 	grab = ScreenGrabby()
            # 	grab.runGrabby(monthid)

            # accessilbity check
            # 	ch = checker.AChecker()
            # 	ch.runChecker(monthid)

            s.closeMonth(monthid)
        else:
            print "not a valid month"