def run():
    """Scrape gaybydegree.org.uk and print, per university, which LGB
    support criteria the site marks as present.

    For each university name in common.universites, finds its link on the
    index page, fetches its detail page, and checks each checklist label
    for a green-tick image.  Results are printed; nothing is returned.
    """
    baseurl = "http://www.gaybydegree.org.uk/index.php?dir=university&task=university"
    print green("> GayByDegree.co.uk scrapping starts now")
    # Maps the exact checklist labels shown on the site to short field names.
    # NOTE(review): "consulation" looks like a typo for "consultation", but it
    # may be the downstream storage key — confirm before renaming.
    checklist_str = {
        "Policy that protects LGB students from bullying" : "policy",
        "Compulsory Staff Training on LGB issues" : "training",
        "Society for LGB students" : "society",
        "Info for students on LGB issues" : "info",
        "Stonewall Diversity Champion" : "diversity",
        "Events for LGB students" : "events",
        "Explicit welfare support for LGB students" : "welfare",
        "Consultation with LGB students" : "consulation",
        "Specific career advice for LGB students" : "career"
    }
    soup = common.fetch_page(baseurl, "html5lib")
    # awkward
    # Words dropped from university names because the site omits them.
    shit = ['university', 'of', 'the']
    for uni in common.universites.values():
        print green(">> %s..." % uni),
        # Sanatize into what the website uses
        uni = ' '.join( filter( lambda x: x.lower() not in shit, uni.split(' ') ) )
        # Now go for it!
        # NOTE(review): uni is fed to re.compile un-escaped — a name with
        # regex metacharacters (e.g. parentheses) would misbehave; confirm
        # inputs are plain words.
        l = soup.find("a", text=re.compile(uni) )
        link = htmlentities.decode(l['href'])
        page_soup = common.fetch_page("http://www.gaybydegree.org.uk/%s" % link, "html5lib")
        print green(bold(" got information"))
        checklist = {}
        for (txt, db_term) in checklist_str.items():
            # The tick/cross image sits next to the label text; a green tick
            # in the img src means the criterion is met.
            p = page_soup.find(text=txt)
            i = p.parent.find("img")
            has = "greentick" in i['src']
            if has:
                print green(">>> Has: %s" % txt)
            else:
                print red(">>> Does not have: %s" % txt)
            checklist[db_term] = has
        print checklist
def facts():
    """Return one random Chuck Norris joke from the remote API.

    Requests the joke with javascript escaping enabled, extracts the
    joke text from the JSON envelope and returns it run through decode().
    """
    response = rq.get(chuck_url, params={'escape': 'javascript'})
    joke = response.json()['value']['joke']
    return decode(joke)
def test_should_decode_basic_entities(self): self.assertEqual('&', htmlentities.decode('&')) self.assertEqual('"', htmlentities.decode('"')) self.assertEqual('<', htmlentities.decode('<'))
def facts():
    """Return one random fact from the remote API, with escaped
    apostrophes restored.

    The API escapes apostrophes as the numeric HTML entity '&#039;';
    the previous source had that literal itself entity-decoded into a
    bare quote (breaking the syntax: ''' opened a triple-quoted string),
    so the entity string is restored here.
    """
    fact = rq.get(base_url + defaults).json()[0]['fact'].replace('&#039;', '\'')
    return decode(fact)
def test_should_decode_utf8_accents(self): self.assertEqual(u'é', htmlentities.decode('é')) self.assertEqual(u'ê', htmlentities.decode('ê'))
# Find every file under path_change containing word1 and, after optional
# per-file confirmation, replace word1 with word2 (plus lower/upper-case
# variants unless flag_complete is set), rewriting the file in place.
#
# Escape characters that grep's pattern syntax treats specially.
# NOTE(review): these are non-raw strings, so "\ " is a literal backslash
# plus space; the list is not exhaustive (e.g. '[' and '.' are unescaped).
word1 = word1.replace(" ","\ ").replace("?","\?").replace("(","\(").replace("*","\*").replace(")","\)")
# Case-insensitive recursive search for files containing word1.
# NOTE(review): command built by concatenation with shell=True — a word1
# containing shell metacharacters could inject commands; confirm inputs
# are trusted before reuse.
process = subprocess.Popen("grep -lir '"+word1+"' "+path_change+"*", stdout=subprocess.PIPE, shell=True)
(files, error) = process.communicate()
files = files.split('\n')
# grep output ends with a newline, so the final split element is empty.
files.pop()
if not auto_complete:
    print('(y/n)\nDo you want change \''+word1+'\' in...\n')
for file in files:
    # Never rewrite this script itself or the word-list file.
    if not file in(script_name, file_with_words):
        if not auto_complete:
            # Interactive mode: ask per file.
            answer = raw_input(file+'? ')
        else:
            # Automatic mode: just report the file; answer is not consulted.
            print file + '\n'
            answer = ''
        if answer in ('yes','y') or auto_complete:
            # Read as ISO-8859-15, decode HTML entities, then strip
            # combining accent marks (NFD decomposition, drop category Mn).
            f = codecs.open(file, encoding='iso-8859-15', mode='r')
            data = f.read()
            f.close()
            data = htmlentities.decode(data)
            data = ''.join((c for c in unicodedata.normalize('NFD', data) if unicodedata.category(c) != 'Mn'))
            # word1 is used as a regex here (it was escaped for grep above);
            # the lower/upper passes also cover all-lower/all-upper variants.
            data = re.sub(word1,word2,data)
            if not flag_complete:
                data = re.sub(word1.lower(),word2.lower(),data)
                data = re.sub(word1.upper(),word2.upper(),data)
            # Write the transformed text back over the original file.
            f = codecs.open(file, encoding='iso-8859-15', mode='w')
            f.write(data)
            f.close()
print('\nCOMPLETED THE CHANGE OF WORDS')
def facts():
    """Fetch a single joke from the Chuck Norris API and return it decoded."""
    query = {'escape': 'javascript'}
    body = rq.get(chuck_url, params=query).json()
    return decode(body['value']['joke'])