def test_dots_fixup():
    """Stray dots in hostnames are removed by recode_uri."""
    # Real-world example:
    # http://www.yelp.com/biz/orange-county-church-of-christ-irvine
    assert E.recode_uri('http://.ocregion.com') == 'http://ocregion.com'
    # Extreme example
    assert E.recode_uri('http://[email protected]....:8080/') == (
        'http://[email protected]:8080/'
    )
def hihira_raupapa_kupu(kupu_hou, tohuto):
    """Look up a single word to see if it is defined on maoridictionary.co.nz.

    Set tohuto = False to not ignore macrons when making the match.
    Returns True or False.
    """
    query = kupu_hou.lower()
    # When macrons are not strict, strip them for the best chance of a match.
    if tohuto:
        query = query.translate(kuare_tohuto)

    # Search both the word and its double-vowel-collapsed form: some texts use
    # double vowels instead of macrons, and the two return different results.
    candidates = [query, whakatakitahi_oropuare(query)]

    wariutanga = False
    for kupu in candidates:
        taukaea = recode_uri(
            'http://maoridictionary.co.nz/search?idiom=&phrase=&proverb=&loan=&histLoanWords=&keywords=' + kupu)
        hupa = BeautifulSoup(urlopen(taukaea), 'html.parser', from_encoding='utf8')
        headings = hupa.find_all('h2')
        # The last two <h2> entries are site chrome, not search results.
        for heading in headings[:-2]:
            text = heading.text.lower()
            # Lowercased (and optionally macron-stripped) for the best chance of a match.
            words = text.translate(kuare_tohuto).split() if tohuto else text.split()
            if kupu in words:
                wariutanga = True
                break
        print("Found " + kupu + ": " + str(wariutanga))
    return wariutanga
def _urlOpen(conn, url, urlMD5):
    """Fetch a page via proxy, translate its title/body to Japanese, store it.

    Returns False (without storing) when the title fails checkTitle;
    otherwise inserts the record via mydb and returns None.
    """
    page = getByProxy(url)  # previously fetched directly with urlopen(url)
    doc = BeautifulSoup(str(page.text), "html.parser")

    if not checkTitle(recode_uri(cleanhtml(doc.title))):
        print('check title is false and return')
        return False

    title_ja = translate_text('ja', cleanhtml(doc.title))

    body_ja = []
    body_raw = []
    for paragraph in doc.findAll("p", {"class": "qtext_para"}):
        if makeSentence(str(paragraph)) != '':
            raw = makeSentence(paragraph)
            body_ja.append(translate_text('ja', raw))
            body_raw.append(raw)

    # Site logo and article image both fall back to the same wordmark asset.
    logo_url = 'https://qsf.ec.quoracdn.net/-3-images.logo.wordmark_default.svg-26-32753849bf197b54.svg'
    title_raw = cleanhtml(doc.title)

    mydb.insert(
        conn, urlMD5, url, logo_url, logo_url,
        pretty.bodyTextJa(title_ja),
        title_raw,
        pretty.bodyTextJa("\n".join(body_ja)),
        "\n".join(body_raw),
        'en',
    )
def test_letters_recoding():
    """Percent-escapes of plain alphanumerics are unquoted by recode_uri."""
    quoted = (
        u'http://💩.la/%74%68%69%73%5f%69%73%5f%61%5f%70%61%74%68'
        u'?a=b%3f%63&%64%3D%65=f#%73%70%61%63%65%73 %61%72%65 %64%61%6e%67%65%72%6f%75%73'
    )
    expected = 'http://xn--ls8h.la/this_is_a_path?a=b%3fc&d%3De=f#spaces%20are%20dangerous'
    assert E.recode_uri(quoted) == expected
def test_url_with_params():
    """A doubleclick redirect URL with embedded params is left untouched."""
    url = (
        'http://ad.doubleclick.net/clk;217976351;41128009;f?'
        'http%3A//www.24hourfitness.com/FindClubDetail.do?'
        'clubid=189&edit=null&semiPromoCode=null&cm_mmc='
        'Yelp-_-ClubPage-_-BusinessListing-_-Link'
    )
    recoded = E.recode_uri(url)
    assert recoded == url
def test_worst_case_unicode():
    """Unicode and utf8-byte forms of u-umlaut normalize identically."""
    for umlaut in (u'ü', 'ü'):
        recoded = E.recode_uri(worst_case(umlaut))
        # Collapse every encoded umlaut to '%' so the expectation is readable.
        collapsed = recoded.replace('%C3%BC', '%')
        assert collapsed == 'http://%:%@www.xn--tda.com/%;%;%/%;%;%?%=%&%=%#!%/%'
def test_bad_bytes():
    """Invalid bytes are unlikely outside gross programming error, but this
    documents what *would* happen to them."""
    for bad_stuff in (u'\xFF', '\xFF', '%FF'):
        recoded = E.recode_uri(worst_case(bad_stuff))
        # Collapse every encoded byte to '%' so the expectation is readable.
        collapsed = recoded.replace('%C3%BF', '%')
        assert collapsed == 'http://%:%@www.xn--wda.com/%;%;%/%;%;%?%=%&%=%#!%/%'
def test_bad_bytes():
    """Invalid bytes are unlikely outside gross programming error, but this
    documents what *would* happen to them (text and bytes variants)."""
    for bad_stuff in (u'\xFF', b'\xFF', u'%FF', b'%FF'):
        recoded = E.recode_uri(worst_case(bad_stuff))
        # Collapse every encoded byte to '%' so the expectation is readable.
        collapsed = recoded.replace('%C3%BF', '%')
        assert collapsed == 'http://%:%@www.xn--wda.com/%;%;%/%;%;%?%=%&%=%#!%/%'
def test_recode_encoded(charname, chars, pathchars, expected_url, encoding):
    """Byte-encoded and percent-quoted URLs both normalize to the same form."""
    template = u"http://m{chars}nchen.com/m{chars}chen/{pathchars}"
    try:
        encoded_url = template.format(chars=chars, pathchars=pathchars).encode(encoding)
    except UnicodeEncodeError:
        pytest.skip("Some of these things just won't go.")
    assert E.recode_uri(encoded_url) == expected_url

    quoted_url = template.format(
        chars=quote(chars.encode(encoding)),
        pathchars=quote(pathchars.encode(encoding)),
    )
    # ASCII is a special case when it comes to quoting: its quoted-ness
    # should go untouched.
    expected = quoted_url if charname == 'ascii' else expected_url
    assert E.recode_uri(quoted_url) == expected
def test_worst_case_ascii():
    """Unsafe ASCII characters are percent-escaped in every URL component."""
    for ch in '<> ':
        recoded = E.recode_uri(worst_case(ch))
        escaped = '%%%2X' % ord(ch)
        assert escaped in recoded
        # Collapse for readability: raw char -> placeholder, escape -> '%'.
        collapsed = recoded.replace(ch, '{ascii_char}').replace(escaped, '%')
        assert collapsed == 'http://%:%@www.{ascii_char}.com/%;%;%/%;%;%?%=%&%=%#!%/%'
def test_yelp_scheme_url():
    """A custom yelp:// scheme URL passes through unchanged."""
    url = 'yelp:///example'
    assert E.recode_uri(url) == url
def test_bad_port2():
    """A malformed port segment (':80wtf80') raises UnicodeError."""
    with pytest.raises(UnicodeError):
        E.recode_uri('http://www.example.com:80wtf80/')
def test_url_with_hashbang():
    """Hashbang fragments survive recoding unchanged.

    For a discussion of url hashbangs, see:
    http://www.jenitennison.com/blog/node/154
    """
    url = 'https://twitter.com/#!/YelpCincy/statuses/179565284020060161'
    recoded = E.recode_uri(url)
    assert recoded == url
def test_param_xss():
    """Angle brackets in path params are escaped; existing escapes survive."""
    recoded = E.recode_uri('/foo;<script>;baz/barney;%2F%3B%25;wilma')
    assert recoded == '/foo;%3Cscript%3E;baz/barney;%2F%3B%25;wilma'
def test_mixed_quoting_param():
    """A query string mixing quoted and raw umlauts comes out uniformly quoted."""
    recoded = E.recode_uri(u'http://www.yelp.com?m%C3%BCnchen=münchen')
    assert recoded == 'http://www.yelp.com?m%C3%BCnchen=m%C3%BCnchen'
def test_mixed_quoting_url():
    """A path mixing quoted and raw umlauts comes out uniformly quoted."""
    recoded = E.recode_uri(u'http://www.yelp.com/m%C3%BCnchen/münchen')
    assert recoded == 'http://www.yelp.com/m%C3%BCnchen/m%C3%BCnchen'
def test_unicode_url_gets_quoted():
    """Raw unicode in a path is percent-encoded as UTF-8."""
    recoded = E.recode_uri(u'http://www.yelp.com/münchen')
    assert recoded == 'http://www.yelp.com/m%C3%BCnchen'
def test_recode_none_raises_attribute_error():
    """Passing None is a programming error and surfaces as AttributeError."""
    with pytest.raises(AttributeError):
        E.recode_uri(None)
def test_recode_unicode(charname, chars, pathchars, expected_url):
    """A fully-unicode URL recodes to the expected normalized form."""
    del charname  # passed by the fixture, but unused here
    url = u"http://m{chars}nchen.com/m{chars}chen/{pathchars}".format(
        chars=chars, pathchars=pathchars)
    assert E.recode_uri(url) == expected_url
def test_path_only_url():
    """Scheme-less, path-only URLs still get unicode and spaces quoted."""
    recoded = E.recode_uri('/➨ ?➨ #➨ ')
    assert recoded == '/%E2%9E%A8%20?%E2%9E%A8%20#%E2%9E%A8%20'
def test_relative_url():
    """Relative URLs get their unicode percent-encoded, nothing more."""
    recoded = E.recode_uri('➨.ws/➨')
    assert recoded == '%E2%9E%A8.ws/%E2%9E%A8'
def test_utf8_url():
    """Tests that a url with mixed quoting in multiple parameters has
    uniform quoting after requoting."""
    raw = u'http://yelp.com/münchen/m%C3%BCnchen?münchen=m%C3%BCnchen&htmlchars=<">'
    expected = (
        'http://yelp.com/m%C3%BCnchen/m%C3%BCnchen?m%C3%BCnchen=m%C3%BCnchen&htmlchars=%3C%22%3E'
    )
    assert E.recode_uri(raw.encode('utf-8')) == expected
def test_mixed_encoding():
    """A URL mixing UTF-8 and Latin-1 escapes comes out uniformly UTF-8."""
    recoded = E.recode_uri(u'http://www.yelp.com/m%C3%BCnchen?m%FCnchen')
    assert recoded == 'http://www.yelp.com/m%C3%BCnchen?m%C3%BCnchen'
def test_url_with_colon():
    """A colon inside the fragment is preserved as-is. (Ticket: 31242)"""
    url = 'http://www.yelp.fr/biz/smalls-marseille#hrid:u_UQvMf97E8pD4HEb59uIw'
    recoded = E.recode_uri(url)
    assert recoded == url
def test_utf8_url():
    """Tests that a url with mixed quoting in multiple parameters has
    uniform quoting after requoting."""
    encoded = u'http://yelp.com/münchen/m%C3%BCnchen?münchen=m%C3%BCnchen&htmlchars=<">'.encode('utf-8')
    assert E.recode_uri(encoded) == (
        'http://yelp.com/m%C3%BCnchen/m%C3%BCnchen?m%C3%BCnchen=m%C3%BCnchen&htmlchars=%3C%22%3E'
    )
def test_bad_domain():
    """Domain labels of length 64 cannot be encoded by the idna codec."""
    too_long = 'http://www.%s.com/' % ('x' * 64)
    with pytest.raises(UnicodeError):
        E.recode_uri(too_long)
def test_multiple_escapes():
    """Only the outermost escaping level is normalized; deeper levels survive."""
    url = u'http://münch.com?zero=münch&one=m%C3%BCnch&two=m%25C3%25BCnch&three=m%2525C3%2525BCnch'
    expected = (
        'http://xn--mnch-0ra.com?zero=m%C3%BCnch&one=m%C3%BCnch&two=m%25C3%25BCnch&three=m%2525C3%2525BCnch'
    )
    assert E.recode_uri(url) == expected
def test_param_xss():
    """Script-tag characters in path params are escaped without disturbing
    escapes that are already present."""
    dangerous = '/foo;<script>;baz/barney;%2F%3B%25;wilma'
    expected = '/foo;%3Cscript%3E;baz/barney;%2F%3B%25;wilma'
    assert E.recode_uri(dangerous) == expected
def test_url_reserved_chars():
    """Pre-quoted reserved characters pass through unchanged."""
    url = 'http://www.yelp.com?chars=%s' % quote(':/?&=')
    recoded = E.recode_uri(url)
    assert recoded == url
def test_multi_params_for_individual_path_segment():
    """Nothing (overly) strange in this url: nothing should be escaped."""
    url = '/foo;bar;baz/barney;fred;wilma'
    recoded = E.recode_uri(url)
    assert recoded == url
def test_bad_user():
    """Regression test: a userinfo component used to raise UnicodeError via
    the idna codec; it must now pass through untouched."""
    url = 'http://[email protected]/'
    assert E.recode_uri(url) == url
def test_recode_unicode(charname, chars, pathchars, expected_url):
    """A fully-unicode URL recodes to the expected normalized form."""
    del charname  # passed, but unused
    # Bug fix: the original called url_template.decode('ascii'), but str has
    # no .decode() in Python 3 (AttributeError). The template literal is
    # already text, so format it directly — matching the sibling test.
    url_template = u"http://m{chars}nchen.com/m{chars}chen/{pathchars}"
    unicode_url = url_template.format(chars=chars, pathchars=pathchars)
    assert E.recode_uri(unicode_url) == expected_url
def test_url_with_params():
    """An ad-redirect URL whose params embed a second URL is left untouched."""
    url = ''.join([
        'http://ad.doubleclick.net/clk;217976351;41128009;f?',
        'http%3A//www.24hourfitness.com/FindClubDetail.do?',
        'clubid=189&edit=null&semiPromoCode=null&cm_mmc=',
        'Yelp-_-ClubPage-_-BusinessListing-_-Link',
    ])
    assert E.recode_uri(url) == url
def _extract_cell(soup, label, fallback, attr=None):
    """Return the content of the <td> that follows the <td> labelled *label*.

    When *attr* is given (e.g. 'href'), return that attribute of the cell's
    first child instead of the child itself. Returns *fallback* when the row
    is missing or shaped unexpectedly.
    """
    try:
        value = soup.find("td", string=label).find_next_sibling("td").contents[0]
        return value[attr] if attr else value
    except Exception:  # was a bare except; keep the best-effort fallback
        return fallback


def scraper(link_list, output_file_name):
    """Scrape one monument page per link and append the rows to a CSV.

    Each page is a label/value table; every field falls back to a
    'lipsa ...' placeholder when absent. Failed links are logged to the
    module-level ``exceptions_file_name`` and returned to the caller.

    Returns the list of links that raised errors while being fetched.
    """
    # (page label, fallback text, attribute to pull) for each CSV column,
    # replacing 14 copy-pasted try/except blocks with one table + helper.
    fields = [
        ("Denumirea", "lipsa denumire", None),
        ("Parohia", "lipsa parohie", None),
        ("Datare", "lipsa datare", None),
        ("Tip", "lipsa tip", None),
        ("Localizare pe hartă", "lipsa localizare", 'href'),
        ("Judeţ", "lipsa judet", None),
        ("Localitate", "lipsa localitate", None),
        ("Comuna", "lipsa comuna", None),
        ("Adresa", "lipsa adresa", None),
        ("Protopopiat", "lipsa protopopiat", None),
        ("Episcopie/Arhiepiscopie", "lipsa episcopie/arhiepiscopie", None),
        ("Mitropolie", "lipsa mitropolie", None),
        ("Cod oficial LMI 2004", "lipsa cod LMI 2004", None),
        ("Descriere", "lipsa descriere", None),
    ]
    rec = []          # stores information from all tables
    error_links = []  # stores links that raised errors when trying to open
    count = 0
    for url in link_list:
        url = recode_uri(url)  # re-encoding potentially poorly encoded urls
        try:
            req = urllib.request.Request(
                url,
                data=None,
                headers={'User-Agent': 'Mozilla/5.0'},
            )
            # Close the HTTP response promptly instead of leaking the socket.
            with urllib.request.urlopen(req) as f:
                soup = bs(f.read().decode('utf-8'))
            count += 1
            print("count = " + str(count))
            row = []
            for label, fallback, attr in fields:
                value = _extract_cell(soup, label, fallback, attr)
                print(value)
                row.append(value)
            rec.append(row)
            print("__________________________________________________________________________________________________________")
        except Exception as e:  # narrowed from bare except; still best-effort
            # Use `with` so the log file is closed even if the write fails.
            with open(exceptions_file_name, 'a') as exceptions_file:
                exceptions_file.write(str(e) + ": " + url + "\n")
            error_links.append(url)
        time.sleep(randrange(3))
        if count % 10 == 0:
            print("sleeping 5")
            time.sleep(5)
    df = pd.DataFrame(rec, columns=[
        'denumirea', 'parohia', 'datare', 'tip', 'link_harta', 'judet',
        'localitate', 'comuna', 'adresa', 'protopopiat',
        'episcopie_arhiepiscopie', 'mitropolie', 'cod_LMI', 'descriere'])
    # If the file doesn't exist, create it and write the dataframe with a
    # header; otherwise append the dataframe without a header.
    df.to_csv(output_file_name, mode='a',
              header=not path.exists(output_file_name), index=False)
    with open(exceptions_file_name, 'a') as exceptions_file:
        exceptions_file.write("________________________________________________________________" + "\n")
    return error_links
# Quora host = 'https://www.quora.com' target = '/What-are-the-biggest-websites-built-with-Node-js-on-the-server-side' #hackernoon host = 'https://hackernoon.com' target = '/the-build-order-every-startup-should-follow-to-become-successful-635e7ed00fa3' for num in range(0, 10000): try: print(num) sleep(1) print('--fetchMulti 26--') ht = host + str(target) print('--fetchMulti 27--') ht = recode_uri(ht) print('--fetchMulti 28--') #hrefs = fetch.getQuoraUrls(conn, ht) hrefs = fetch.getHackerNoonUrls(conn, ht) prev = hrefs if (len(hrefs) > 0): prev = hrefs print(len(hrefs)) print('--fetchMulti 29--') target = recode_uri(numpy.random.choice(prev)) print('--fetchMulti 30--') ht = host + target print('--fetchMulti 31--') fetch.fetchQuora(conn, ht) print(ht)