# csv and os are used throughout this module; loadData comes from akaparser
# (see googleCrawler below). These imports are assumed to sit at the top of
# the file.
import csv
import os

from akaparser import loadData


def checkWebsite():
    """Fetch every university's website and log exceptions, non-200 status
    codes, and redirects to logs_website_invalid.csv."""
    import requests

    logs_file = 'logs_website_invalid.csv'
    fcsv = open(logs_file, 'wb')
    writer = csv.writer(fcsv)
    writer.writerow(['id', 'name', 'error', 'url', 'finalurl'])

    unilist = loadData('list_0423.csv')
    for uni in unilist[1:]:  # skip the header row
        url = uni[4]
        if url == '':
            continue
        print url
        try:
            res = requests.get(url, timeout=20)
        except Exception, e:
            print e
            writer.writerow([uni[0], uni[1], 'exception', uni[4], ''])
        else:
            if res.status_code != 200:
                print res.status_code, res.history, res.url
                writer.writerow([uni[0], uni[1],
                                 'badcode:' + str(res.status_code),
                                 uni[4], res.url.encode('utf-8')])
            if res.status_code == 200 and res.history:
                print res.status_code, res.history, res.url
                writer.writerow([uni[0], uni[1], 'redirect',
                                 uni[4], res.url.encode('utf-8')])
    fcsv.close()
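

# loadData is imported from akaparser and not defined in this section. A
# minimal sketch of its presumed behaviour, inferred from how it is used
# here: it returns the CSV rows as lists, header row included, with
# row[0] = id, row[1] = name, row[4] = website URL. The name load_rows is
# hypothetical and is not part of akaparser.
def load_rows(path):
    with open(path, 'rb') as f:
        return list(csv.reader(f))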


def checkEmblem():
    """Count and report universities whose emblem PNG is missing from
    emblem_0422/."""
    unilist = loadData('list_0423.csv')
    count = 0
    for uni in unilist[1:]:
        imgname = uni[0] + '.png'
        if not os.path.exists('emblem_0422/' + imgname):
            print imgname, uni[1]
            count = count + 1
    print count


def correctWebsite():
    """Patch corrected URLs into the university list, writing the result to
    list_0425_3.csv. logs_website_merge.csv is assumed to share the layout
    of the invalid-site log, with the fixed URL in its last column."""
    with open('logs_website_merge.csv', 'rb') as f:
        sitelist = list(csv.reader(f))[1:]
    sitemap = {}
    for s in sitelist:
        sitemap[s[0]] = s  # key each log row by university id

    unilist = loadData('list_0425_2.csv')
    fout = open('list_0425_3.csv', 'wb')
    writer = csv.writer(fout)
    writer.writerow(unilist[0])  # copy the header row
    count = 0
    for uni in unilist[1:]:
        uid = uni[0]
        if uid in sitemap and sitemap[uid][-1] != '':
            count = count + 1
            uni[4] = sitemap[uid][-1]  # swap in the corrected URL
        writer.writerow(uni)
    fout.close()
    print count


def googleCrawler():
    """Scrape the Google results page for each university that still lacks
    both an emblem image and a cached results page."""
    unilist = loadData('list_0423.csv')
    # uniname = 'Universidad Torcuato di Tella'
    # uniname = 'Universidad Nacional de Cordoba'
    count = 0
    for uni in unilist[1:]:
        img = uni[0] + '.png'
        if os.path.exists('emblem_0413/' + img):
            continue
        if os.path.exists('html_google_2/' + uni[0] + '.html'):
            continue
        uniname = uni[1]
        count = count + 1
        print count
        scrapeGoogle(uni[0], uniname)
        # break
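

# scrapeGoogle is not defined in this section. A minimal sketch of what it
# presumably does, judging from the cache check in googleCrawler (it saves
# the Google results page for a university under html_google_2/<id>.html).
# The query URL and User-Agent here are assumptions, not the original
# implementation, and Google may throttle or block automated requests.
def scrapeGoogle(uid, uniname):
    import urllib
    import requests
    url = 'http://www.google.com/search?q=' + urllib.quote_plus(uniname)
    res = requests.get(url, timeout=20,
                       headers={'User-Agent': 'Mozilla/5.0'})
    with open('html_google_2/' + uid + '.html', 'wb') as f:
        f.write(res.content)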