def get_url_of_countries_nz(driver):
    info = {}
    name = ''
    LOGGER.info('Retrieving URL of all countries for New Zealand advisory')
    try:
        # this is the link to the first page
        url = 'https://safetravel.govt.nz/travel-advisories-destination'
        # set up the headless chrome driver
        # note: the passed-in driver is ignored; this function creates and
        # quits its own driver -- to change
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # pattern of the link to the country page that the href should match
        reg = regex.compile(r'\w+-*')
        table = soup.find('table')
        table_body = table.find('tbody')
        table_rows = table_body.find_all('tr')
        for tr in table_rows:
            cols = tr.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            name = cols[1]
            a = tr.find('a', attrs={'href': reg})
            info[name] = {"href": a['href']}
            LOGGER.success(f'URL for {name} was successfully retrieved')
        LOGGER.success('Successfully retrieved URL of all countries for the New Zealand advisory')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while retrieving the URLs of {name} for the New Zealand advisory because of the following error: {error_msg}')
    finally:
        quit_driver(driver)
    return info

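# Nearly every parser in this module repeats the same create-driver /
# load-page / hand-off-to-BeautifulSoup sequence, and several carry
# "-- to change" notes about which function owns the driver. A minimal
# sketch of a shared helper under those assumptions; get_page_soup is a
# hypothetical name, not something the codebase currently defines:
def get_page_soup(url, driver=None):
    """Return the BeautifulSoup of url, managing the driver only if we own it."""
    owns_driver = driver is None
    if owns_driver:
        driver = create_driver()
    try:
        driver.get(url)
        return BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit only the driver this helper created; leave callers' drivers alive
        if owns_driver:
            quit_driver(driver)
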
def parse_one_country_vaccine(url, country):
    driver = create_driver()
    driver.get(url)
    vaccines = {}
    LOGGER.info(f'Parsing the vaccination information for the following country: {country}')
    # Selenium hands the page source to Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'lxml')
    count = 0
    for tbody in soup.find_all('tbody'):
        for row in tbody.find_all('tr'):
            name = row.find('td', {"class": "traveler-disease"})
            info = row.find('td', {"class": "traveler-findoutwhy"})
            if name and info:
                name = name.text.strip('/\n')
                # the first table has its line breaks stripped outright;
                # later tables replace them with spaces
                if count == 0:
                    info = info.text.replace('\n', '')
                else:
                    info = info.text.replace('\n', ' ')
                vaccines[name] = info
        count = count + 1
    quit_driver(driver)
    save_one_country(vaccines, country)
    print(vaccines)
    return vaccines

def find_a_post(location, request_id, i=1):
    LOGGER.info(f'Starting the parser for the following location: {location}')
    driver = create_driver()
    location = location.replace(' ', '')
    url = instagram_url + location + "/"
    try:
        LOGGER.info(f'Retrieving the link to the image page for: {location}')
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        garb_all = soup.find_all('a', {'href': regex.compile(r'/p/')})
    except Exception:
        LOGGER.error(f'Could not get the link to the image page for: {location}')
        # bail out: without the links there is nothing to parse
        quit_driver(driver)
        return False
    count = 0
    for g in garb_all:
        count += 1
        if count > i:
            break
        u = "https://www.instagram.com" + g.get('href')
        try:
            image_info = get_image_info(driver, u)
            LOGGER.success(f'Image info for: {location}')
        except Exception:
            LOGGER.error(f'Could not get the info of the image for: {location}')
            count -= 1
            continue
        try:
            save_img_url(image_info['image_link'], 'images_to_filter/check.jpg')
            selfie = check_if_selfie('images_to_filter/check.jpg')
            group_photo = check_if_group_photo('images_to_filter/check.jpg')
            objects_too_big = check_for_objects('images_to_filter/check.jpg')
            too_much_similar_colors = find_nearest_colors('images_to_filter/check.jpg')
            if (not selfie and not group_photo and not objects_too_big
                    and not too_much_similar_colors
                    and not check_if_wrong_geolocation(location, image_info['geolocation'])):
                save_image("images", image_info, location, str(request_id))
                LOGGER.success(f'Saved Image info for: {location}')
                quit_driver(driver)
                return True
            else:
                # keep the rejected image around for manual inspection
                failed_img = Image.open('images_to_filter/check.jpg')
                failed_img.save(f'images_to_filter/discarded/{get_last_discarded()}.jpg')
                LOGGER.error('Cannot save image. It is now in images_to_filter/discarded/')
                count -= 1
        except Exception:
            LOGGER.error(f'Could not save the info of the image for: {location}')
            count -= 1
    quit_driver(driver)
    return False

def save_to_MU():
    LOGGER.info('Saving and parsing Mauritius into the database')
    driver = create_driver()
    LOGGER.info('Begin parsing for Mauritius advisory')
    try:
        wiki_visa_url = wiki_visa_url_MU
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()
        LOGGER.success('Parsing for Mauritius advisory has been successfully completed')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while parsing for Mauritius advisory because of the following error: {error_msg}')
    info = {}
    array_info = []
    # create an sqlite_advisory object
    db = Database("countries.sqlite")
    db.drop_table("MU")
    db.add_table("MU",
                 country_iso="text",
                 name="text",
                 advisory_text="text",
                 visa_info="text")
    LOGGER.info('Saving Mauritius table into the database')
    try:
        for country in visas:
            iso = find_iso_of_country(country)
            if iso != "":
                name = country
                LOGGER.info(f'Saving {name}')
                # dictionary for visa info is country{visa: text}
                visa = visas[country].get('visa')
                advisory = "Not available yet"
                info = {
                    "country_iso": iso,
                    "name": name,
                    "advisory": advisory,
                    "visa_info": visa
                }
                array_info.append(info)
                print(name, " ", visa, " ", advisory)
                db.insert("MU", iso, name, advisory, visa)
                LOGGER.success(f'{name} was successfully saved to the database with the following information: {visa}. {advisory}.')
        LOGGER.success('Mauritius table successfully saved to the database')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while saving Mauritius table to the database because of the following error: {error_msg}')
    db.close_connection()
    quit_driver(driver)
    with open('./advisory-mu.json', 'w') as outfile:
        json.dump(array_info, outfile)

def find_all_ireland():
    LOGGER.info("Begin parsing and saving for Ireland...")
    my_driver = create_driver()
    all_url = find_all_url(my_driver)
    data = find_all_iso(all_url)
    LOGGER.info('Parsing visa requirements for all countries for the Ireland advisory')
    try:
        wiki_visa_ob = wiki_visa_parser(
            "https://en.wikipedia.org/wiki/Visa_requirements_for_Irish_citizens",
            my_driver)
        visas = wiki_visa_ob.visa_parser_table()
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while getting the visa requirements for the Ireland advisory because of the following error: {error_msg}')
    for country in data:
        c = data[country]
        url = c['href']
        my_driver.implicitly_wait(5)
        my_driver.get(url)
        soup = BeautifulSoup(my_driver.page_source, 'lxml')
        c['visa-info'] = get_one_info(url, 'visa/passport', my_driver, soup)
        c['advisory-text'] = get_one_advisory(url, my_driver, soup)
        c['name'] = country
        if c['visa-info'] == '':
            c['visa-info'] = get_one_info(url, 'Entry requirements', my_driver, soup)
        iso = c['country-iso']
        # handling some exceptions, had to do research
        if iso == 'AI':
            c['visa-info'] = 'Visa not required for 3 months'
        elif iso == 'BM':
            c['visa-info'] = 'Visa not required for 21 days (extendable)'
        elif iso == 'MQ':
            iso = 'FR'
        elif iso == 'MS':
            c['visa-info'] = 'Visa not required for 6 months'
        elif iso == 'RE':
            iso = 'FR'
        else:
            try:
                c['visa-info'] = visas[country].get('visa') + "<br>" + c['visa-info']
            except Exception as error_msg:
                print(c, error_msg)
                LOGGER.warning(f'Error message: {error_msg}')
    # dump the data into json, to be deleted later
    quit_driver(my_driver)
    with open('./advisory-ie.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)


# find_all_ireland()

def save_to_united_states():
    LOGGER.info("Begin parsing and saving for United States table...")
    driver = create_driver()
    data = {}  # Used to store all the parsed data of each country
    name_to_advisories = {}  # Stores the names and associated advisories
    LOGGER.info('Retrieving visa requirements for all countries for the United States advisory')
    name_advisory = get_name_and_advisory_of_countries()
    wiki_visa_url = "https://en.wikipedia.org/wiki/Visa_requirements_for_United_States_citizens"
    wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
    visas = wiki_visa_ob.visa_parser_table()
    LOGGER.success('Successfully retrieved visa requirements for all countries for the United States advisory')
    # sort the dictionary containing names and advisories
    for name in sorted(name_advisory.keys()):
        name_to_advisories[name] = name_advisory[name]
    counter_country = 0
    # iterate through name_to_advisories to retrieve the advisories
    for country in name_to_advisories:
        driver.implicitly_wait(5)
        name = country
        advisory = name_to_advisories[country]
        visa_text = ""
        # match the country against the list of visas, consuming each entry once
        if country in visas:
            visa_text = visas.pop(country).get('visa')
        country_iso = "na"
        data[name] = {
            'country-iso': country_iso,
            'name': name,
            'advisory-text': advisory,
            'visa-info': visa_text
        }
        # restart the driver every 50 countries to keep memory in check
        if (counter_country % 50) == 0:
            quit_driver(driver)
            driver = create_driver()
        counter_country += 1
    data = find_all_iso(data)  # Sets the iso for each country
    with open('./advisory-us.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)

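# save_to_united_states and save_to_new_zealand (below) both restart the
# Chrome driver every 50 countries to keep memory in check. A sketch of that
# pattern factored out, assuming the existing create_driver/quit_driver
# utilities; recycle_driver is a hypothetical helper name:
def recycle_driver(driver, counter, every=50):
    """Quit and recreate the driver once every `every` iterations."""
    if counter % every == 0:
        quit_driver(driver)
        driver = create_driver()
    return driver
# usage inside the loop: driver = recycle_driver(driver, counter_country)
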
def parse_one_country_advisory(url, href):
    # href is currently unused
    driver = create_driver()
    driver.get(url)
    advisory = ""
    # Selenium hands the page source to Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'lxml')
    advisory_div = soup.find("div", {"class": "gem-c-govspeak govuk-govspeak direction-ltr"})
    # the advisory is the first two paragraphs of the advisory div
    paragraphs = advisory_div.find_all("p")
    advisory = paragraphs[0].text + " " + paragraphs[1].text
    quit_driver(driver)
    return advisory

def parse_one_country_advisory(url):
    driver = create_driver()
    driver.get(url)
    # Selenium hands the page source to Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'lxml')
    advisory_paragraph1 = ""
    # the pages of certain countries are built differently, so fall back
    # through the known layouts until one of them matches
    accordion_class = ("acc-content ui-accordion-content ui-corner-bottom "
                       "ui-helper-reset ui-widget-content ui-accordion-content-active")
    try:
        advisory_div = soup.find_all("div", {"class": accordion_class})[1]
        advisory_paragraph = advisory_div.find_all("span")[0].text
        advisory_paragraph1 = advisory_paragraph.split('\n')[0]
    except IndexError:
        try:
            advisory_div = soup.find_all("div", {"class": accordion_class})[1]
            advisory_paragraph = advisory_div.find_all("p")[0].text
            advisory_paragraph1 = advisory_paragraph.split('\n')[0]
        except IndexError:
            try:
                advisory_div = soup.find_all("div", {"class": accordion_class})[1]
                advisory_paragraph = advisory_div.text
                advisory_paragraph1 = advisory_paragraph.split('\n')[1]
            except IndexError:
                try:
                    advisory_div = soup.find_all("div", {"class": "alert-section"})[0]
                    advisory_paragraph = advisory_div.find_all("p")[0].text
                    advisory_paragraph1 = advisory_paragraph.split('\n')[0]
                except IndexError:
                    advisory_div = soup.find_all("div", {"class": "space"})[0]
                    advisory_paragraph = advisory_div.find_all("p")[1].text
                    advisory_paragraph1 = advisory_paragraph.split('\n')[0]
    advisory_paragraph1 = advisory_paragraph1.lstrip()
    LOGGER.info(advisory_paragraph1)
    quit_driver(driver)
    return advisory_paragraph1

def get_url_of_countries():
    info = {}
    try:
        # this is the link to the first page
        url = 'https://smartraveller.gov.au/countries/pages/list.aspx'
        LOGGER.info('Retrieving the URLs for all countries for the Australian advisory')
        # create a new chrome session
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # pattern of the link to the country page that the href should match
        reg = regex.compile(r'\/destinations\/\w+-*\w*\/\w+-*\w*')
        table = soup.find('table')
        table_body = table.find('tbody')
        table_rows = table_body.find_all('tr')
        for tr in table_rows:
            cols = tr.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            if cols[2] == '':
                cols[2] = 'No advisory from the Australian government'
            name = cols[0]
            advisory_text = cols[2]
            a = tr.find('a', attrs={'href': reg})
            if a is not None:
                href = a['href']
                info[name] = {"href": href, "advisory-text": advisory_text}
                LOGGER.success(f'Retrieved URL for {name}')
        LOGGER.success('Successfully retrieved the URLs for all countries of the Australian advisory')
    except Exception:
        LOGGER.error('An error has occurred while retrieving the URLs for all countries for the Australian advisory')
    finally:
        quit_driver(driver)
    return info

def save_to_new_zealand():
    LOGGER.info("Begin parsing and saving for New Zealand table...")
    driver = create_driver()
    data = {}  # Used to store all the parsed data of each country
    # get_url_of_countries_nz creates its own driver -- to change
    url = get_url_of_countries_nz(driver)
    LOGGER.info('Retrieving visa requirements for New Zealand advisory')
    try:
        wiki_visa_url = "https://en.wikipedia.org/wiki/Visa_requirements_for_New_Zealand_citizens"
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        # Used to acquire the visa info of each country
        visas = wiki_visa_ob.visa_parser_table()
        LOGGER.success('Successfully retrieved visa requirements of all countries for the New Zealand advisory')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while retrieving visa requirements for the New Zealand advisory because of the following error: {error_msg}')
    counter_country = 0
    # iterate through the URLs to retrieve the advisory information
    for country in url:
        driver.implicitly_wait(5)
        name = country
        href = url[country].get("href")
        link = "https://safetravel.govt.nz/{}".format(href)
        advisory = parse_a_country_advisory(link, driver)
        visa_text = ""
        # match the country against the list of visas, consuming each entry once
        if country in visas:
            visa_text = visas.pop(country).get('visa')
        country_iso = "na"
        data[name] = {
            'country-iso': country_iso,
            'name': name,
            'advisory-text': advisory,
            'visa-info': visa_text
        }
        # restart the driver every 50 countries to keep memory in check
        if (counter_country % 50) == 0:
            quit_driver(driver)
            driver = create_driver()
        counter_country += 1
    data = find_all_iso(data)  # Sets the iso for each country
    with open('./advisory-nz.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)

def save_to_weather():
    # Antigua and Barbuda
    LOGGER.info('Beginning parsing for average monthly temperature')
    avg_monthly_temperature = ''
    driver = create_driver()
    try:
        wiki_temperature = wiki_weather_parser(wiki_visa_temperature, driver)
        avg_monthly_temperature = wiki_temperature.visa_parser_table()
        LOGGER.success(f'Following data was retrieved: {avg_monthly_temperature}')
        save_into_db('weather', avg_monthly_temperature)
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while parsing for temperature because of the following error: {error_msg}')
    finally:
        # quit the driver even when parsing fails
        quit_driver(driver)

def get_all_links():
    LOGGER.info('Retrieving the URLs for all countries for unsafe areas')
    iso_list = config.iso_list
    data = {}
    # home page link
    home = 'https://travel.gc.ca/travelling/advisories'
    driver = create_driver()
    driver.get(home)
    try:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        table = soup.find('table', attrs={'id': 'reportlist'})
        tbody = table.find('tbody')
        rows = tbody.find_all('tr')
        # parse the table and get the link in the <a> tag
        for row in rows:
            col1 = row.find('a')
            name = col1.text
            href = col1['href']
            # the iso function accepts a dictionary with the name as key,
            # so a few names need to be fixed up first
            if name == "Canary Islands":
                # not matched by the iso lookup, so set the iso manually
                data[name] = {'href': href, 'country-iso': 'CI'}
            else:
                if name == "Saint Vincent & the Grenadines":
                    name = "Saint Vincent and the Grenadines"
                elif name == "Virgin Islands (U.S.)":
                    name = "United States Virgin Islands"
                data[name] = {'href': href}
            LOGGER.success(f'Retrieved the URL for {name}')
        LOGGER.success('Retrieved all the URLs for unsafe areas')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while retrieving the URLs for all countries from the Canadian travel website because of the following error: {error_msg}')
    finally:
        quit_driver(driver)
    data = find_all_iso(data)
    return data

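# get_all_links patches the country names one special case at a time. The same
# fix-ups could live in a lookup table, which keeps the loop body flat as new
# renames turn up; NAME_FIXES is a hypothetical constant, not part of the
# current codebase:
NAME_FIXES = {
    "Saint Vincent & the Grenadines": "Saint Vincent and the Grenadines",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
}
# usage in the row loop: name = NAME_FIXES.get(name, name)
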
def translate(iso_language):
    # parse the languages 10 by 10 to track any error more easily
    count = 0
    driver = create_driver()
    for lg in iso_language:
        for p in PHRASES:
            iso = iso_language[lg]
            p_edit = p.replace(" ", "%20")
            url = 'https://translate.google.com/?sl=en&tl=' + iso + '&text=' + p_edit
            try:
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'lxml')
            except Exception:
                LOGGER.error(f'Could not parse {lg}')
                continue
            try:
                translation = soup.find('span', {
                    'class': 'tlid-translation translation'
                }).text
                pronunciation = soup.find_all('div', {
                    'class': 'tlid-transliteration-content transliteration-content full'
                })[1].text
            except Exception:
                LOGGER.info(f"Could not find data for {lg}; it will be replaced by '-'")
                translation = "-"
                pronunciation = "-"
            DB.insert('phrases', iso, lg, p, translation, pronunciation)
        count += 1
        if count == 10:
            quit_driver(driver)
            return
    quit_driver(driver)
    return

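# translate() percent-encodes spaces by hand, which leaves characters such as
# '&' or '?' inside a phrase able to break the query string. A sketch using
# the standard library instead, assuming the same Google Translate URL layout;
# build_translate_url is a hypothetical helper name:
from urllib.parse import quote

def build_translate_url(iso, phrase):
    # quote() percent-encodes every unsafe character, not just spaces
    return 'https://translate.google.com/?sl=en&tl=' + iso + '&text=' + quote(phrase)
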
def save_to_unsafe_areas():
    driver = create_driver()
    all_countries = get_all_links()
    data = {}
    for country in all_countries:
        country_info = all_countries[country]
        href = country_info['href']
        url = "https://travel.gc.ca" + href
        regional_advisory = get_regional_advisories(url, driver)
        data[country] = {'unsafe_areas': regional_advisory}
    # Canada special case
    data['Canada'] = {
        'unsafe_areas': 'There is no regional advisory, take security precautions based on the general advisory for this country.'
    }
    data = find_all_iso(data)
    save_regional_advisories(data)
    quit_driver(driver)

def save_to_SG():
    LOGGER.info('Saving Singapore into the database')
    driver = create_driver()
    LOGGER.info('Parsing visa requirements for all countries into the Singapore table')
    try:
        wiki_visa_url = wiki_visa_url_SG
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()
        LOGGER.success('Visa requirements have been successfully parsed for the Singapore table')
    except Exception as error_msg:
        LOGGER.error(f'An error has occurred while parsing for visa requirements because of the following error: {error_msg}')
    advisories = parse_all_countries_advisories()
    array_info = []
    # create an sqlite_advisory object
    db = Database("countries.sqlite")
    db.drop_table("SG")
    db.add_table("SG",
                 country_iso="text",
                 name="text",
                 advisory_text="text",
                 visa_info="text")
    array_info = save_info(db, visas, advisories, array_info)
    db.close_connection()
    LOGGER.success('Singapore was successfully saved to the database')
    quit_driver(driver)
    with open('./advisory-sg.json', 'w') as outfile:
        json.dump(array_info, outfile)

def get_additional_advisory_info_url():
    url = 'https://travel.gc.ca/travelling/advisories'
    # set up the headless chrome driver
    driver = create_driver()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table')
    table_body = table.find('tbody')
    table_rows = table_body.find_all('tr', attrs={'class': 'gradeX'})
    additional_advisory = {}
    for row in table_rows:
        cols = row.find_all('td')
        country = cols[0].text
        iso = find_iso_of_country(country)
        advisory = cols[2].text
        additional_advisory[iso] = {
            'country_name': country,
            'advisory_text': advisory
        }
    quit_driver(driver)
    return additional_advisory

save_into_db(data)


def all_unsafe_areas():
    # get_url_of_countries creates its own driver -- to change
    url = get_url_of_countries()
    data = {}
    driver = create_driver()
    LOGGER.info('Retrieving all unsafe areas')
    for country in url:
        href = url[country].get('href')
        link = "https://smartraveller.gov.au{}".format(href)
        unsafe_areas = regional_advice_level(driver, link)
        data[country] = {'unsafe_areas': unsafe_areas}
        LOGGER.info(f'{data[country]}')
    data = find_all_iso(data)
    quit_driver(driver)
    # saving the data in a json file
    with open('unsafe-areas-au.json', 'w') as fp:
        json.dump(data, fp)


# save_to_australia()

# manual check of regional_advice_level on a single destination
driver = create_driver()
data = regional_advice_level(
    driver, "https://www.smartraveller.gov.au/destinations/africa/mali")
quit_driver(driver)
# save_to_australia()