def save_to_united_states():
    """Parse the US advisory and visa data for every country and persist it.

    Merges the state.gov advisories with the visa requirements scraped from
    Wikipedia, writes the result to ./advisory-us.json and saves it into the
    database via save_into_db().
    """
    LOGGER.info("Begin parsing and saving for United States table...")
    driver = create_driver()
    data = {}  # Used to store all the parsed data of each country
    LOGGER.info(
        f'Retrieving visa requirements for all countries for the United States advisory'
    )
    name_advisory = get_name_and_advisory_of_countries()
    wiki_visa_url = "https://en.wikipedia.org/wiki/Visa_requirements_for_United_States_citizens"
    wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
    visas = wiki_visa_ob.visa_parser_table()
    LOGGER.success(
        'Successfully retrieved visa requirements for all countries for the United States advisory'
    )
    # Process the advisories in alphabetical order of country name.
    name_to_advisories = {name: name_advisory[name]
                          for name in sorted(name_advisory.keys())}
    counter_country = 0
    for name, advisory in name_to_advisories.items():
        driver.implicitly_wait(5)
        # Pop the matching visa entry so the remaining lookup set keeps shrinking.
        visa_text = ""
        if name in visas:
            visa_text = visas[name].get('visa')
            del visas[name]
        country_iso = "na"  # real ISO codes are filled in by find_all_iso below
        data[name] = {
            'country-iso': country_iso,
            'name': name,
            'advisory-text': advisory,
            'visa-info': visa_text
        }
        # Recycle the driver every 50 countries to keep memory in check.
        # BUGFIX: the original also fired at counter == 0, throwing away the
        # freshly created driver on the very first iteration.
        if counter_country and counter_country % 50 == 0:
            quit_driver(driver)
            driver = create_driver()
        counter_country += 1
    quit_driver(driver)  # BUGFIX: the last driver instance was never released
    data = find_all_iso(data)  # Sets iso for each country
    with open('./advisory-us.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)
def get_url_of_countries_nz(driver):
    """Return {country_name: {"href": url}} for the New Zealand advisory site.

    NOTE(review): the received ``driver`` is immediately replaced by a locally
    created one (parameter kept for caller compatibility); only the local
    driver is quit here, so the caller's driver stays usable.
    """
    info = {}
    name = ""  # BUGFIX: defined up-front so the error log below cannot NameError
    LOGGER.info('Retrieving URL of all countries for New Zealand advisory')
    try:
        # this is the link to the first page
        url = 'https://safetravel.govt.nz/travel-advisories-destination'
        # set up the headless chrome driver
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # pattern of the link to the country page that the href should match
        reg = regex.compile(r'\w+-*')
        table_rows = soup.find('table').find('tbody').find_all('tr')
        for tr in table_rows:
            cols = [ele.text.strip() for ele in tr.find_all('td')]
            name = cols[1]
            a = tr.find('a', attrs={'href': reg})
            info[name] = {"href": a['href']}
            LOGGER.success(f'URL for {name} was successfully retrieved')
        LOGGER.success('Successfully retrieved URL of all countries for the New Zealand advisory')
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving the URLs of {name} for New Zealand advisory because of the following error: {error_msg}')
    finally:
        quit_driver(driver)
    return info
def get_url_of_for_letter(dictionnary, letter):
    """Add {country_name: {"href": url}} entries for every country whose name
    starts with *letter* on the Singapore MFA site; return the updated dict."""
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    try:
        # this is the link to the first page
        url = 'https://www.mfa.gov.sg/Where-Are-You-Travelling-To?letter={}'.format(
            letter, sep='')
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # All countries for a given letter
        countries = soup.findAll("a", {"class": "embassy-dropdown"})
        # retrieving links for all countries of the alphabet
        for country in countries:
            country_name = country.text.strip()
            country_iso = find_iso_of_country(country_name)
            # Countries that don't have an iso are not official countries
            if country_iso != "":
                dictionnary[country_name] = {"href": country['href']}
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
    return dictionnary
def get_url_of_countries():
    """Return {country_iso: {"href": url}} for the UK foreign-travel-advice site."""
    info = {}
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    LOGGER.info('Retrieving URL of all countries for United Kingdom advisory')
    try:
        # this is the link to the first page
        url = 'https://www.gov.uk/foreign-travel-advice'
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # the second two-thirds column holds the country list
        countries_div = soup.findAll("div", {"class": "govuk-grid-column-two-thirds"})[1]
        countries = countries_div.findAll('a')
        # retrieving links for all countries
        for country in countries:
            country_name = country.text
            country_iso = find_iso_of_country(country_name)
            # Countries that don't have an iso are not official countries
            if country_iso != "":
                info[country_iso] = {"href": country['href']}
                LOGGER.success(f'URL of {country_name} was successfully retrieved')
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving URL of countries for United Kingdom advisory because of the following error: {error_msg}')
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
    return info
def parse_one_country_vaccine(url, country):
    """Scrape the vaccination table at *url* and return {disease: info} for
    *country*; the parsed dict is also persisted via save_one_country()."""
    driver = create_driver()
    vaccines = {}
    try:
        driver.get(url)
        LOGGER.info(
            f'Parsing the informations for vaccinations for the following country: {country}'
        )
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        count = 0
        for tbody in soup.findAll('tbody'):
            for row in tbody.findAll('tr'):
                name = row.find('td', {"class": "traveler-disease"})
                info = row.find('td', {"class": "traveler-findoutwhy"})
                if name and info:
                    name = name.text.strip('/\n')
                    # First entry keeps text compact; later ones need spacing.
                    if count == 0:
                        info = info.text.replace('\n', '')
                    else:
                        info = info.text.replace('\n', ' ')
                    vaccines[name] = info
                    count = count + 1
    finally:
        quit_driver(driver)  # BUGFIX: driver is released even if parsing raises
    save_one_country(vaccines, country)
    LOGGER.info(f'{vaccines}')  # BUGFIX: replaced stray debug print with logging
    return vaccines
def save_to_MU():
    """Parse the Mauritius visa table from Wikipedia and save it to the MU
    database table and to ./advisory-mu.json."""
    LOGGER.info(f'Saving and parsing Mauritius into the databse')
    driver = create_driver()
    LOGGER.info('Begin parsing for Mauritius advisory')
    visas = {}  # BUGFIX: defined up-front so a parsing failure cannot NameError below
    try:
        wiki_visa_url = wiki_visa_url_MU
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()
        LOGGER.success(
            'Parsing for Mauritius advisory has been successfully completed')
    except Exception as error_msg:
        LOGGER.error(
            f'Error has occured while parsing for Mauritius advisory because of the following error: {error_msg}'
        )
    info = {}
    array_info = []
    # create an sqlite_advisory object
    db = Database("countries.sqlite")
    db.drop_table("MU")
    db.add_table("MU",
                 country_iso="text",
                 name="text",
                 advisory_text="text",
                 visa_info="text")
    LOGGER.info('Saving Mauritius table into the database')
    try:
        for country in visas:
            iso = find_iso_of_country(country)
            # countries without an iso are skipped
            if iso != "":
                name = country
                LOGGER.info(f'Saving {name}')
                # dictionary for visa info is country{visa:text}
                visa = visas[country].get('visa')
                advisory = "Not available yet"
                info = {
                    "country_iso": iso,
                    "name": name,
                    "advisory": advisory,
                    "visa_info": visa
                }
                array_info.append(info)
                # BUGFIX: removed stray debug print of name/visa/advisory
                db.insert("MU", iso, name, advisory, visa)
                LOGGER.success(
                    f'{name} was sucessfully saved to the database with the following information: {visa}. {advisory}.'
                )
        LOGGER.success('Mauritius table successfully saved to the database')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while saving Mauritius table to the database because of the following error: {error_msg}'
        )
    db.close_connection()
    quit_driver(driver)
    with open('./advisory-mu.json', 'w') as outfile:
        json.dump(array_info, outfile)
def find_a_post(location, request_id, i=1):
    """Find up to *i* instagram posts for *location*, run the image filters,
    and save the first acceptable image.

    Returns True when an image was saved, False otherwise.
    """
    LOGGER.info(f'Starting the parser for the following location: {location}')
    driver = create_driver()
    location = location.replace(' ', '')
    url = instagram_url + location + "/"
    try:
        LOGGER.info(f'Retreiving the link to the image page for: {location}')
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        garb_all = soup.find_all('a', {'href': regex.compile(r'/p/')})
    except Exception:
        LOGGER.error(
            f'Could not get the link to the image page for: {location}')
        # BUGFIX: the original evaluated the bare name `exit` (a no-op) and
        # then fell through to an unbound `garb_all`; abort properly instead.
        quit_driver(driver)
        return False
    count = 0
    for g in garb_all:
        count += 1
        if count > i:
            break
        u = "https://www.instagram.com" + g.get('href')
        try:
            image_info = get_image_info(driver, u)
            LOGGER.success(f'Image info for: {location}')
        except Exception:
            LOGGER.error(
                f'Could not get the info of the image for: {location}')
            count -= 1
            continue  # BUGFIX: image_info would be unbound below
        try:
            save_img_url(image_info['image_link'], 'images_to_filter/check.jpg')
            selfie = check_if_selfie('images_to_filter/check.jpg')
            group_photo = check_if_group_photo('images_to_filter/check.jpg')
            objects_too_big = check_for_objects('images_to_filter/check.jpg')
            too_much_similar_colors = find_nearest_colors(
                'images_to_filter/check.jpg')
            if (not selfie and not group_photo and not objects_too_big
                    and not too_much_similar_colors
                    and not check_if_wrong_geolocation(
                        location, image_info['geolocation'])):
                save_image("images", image_info, location, str(request_id))
                LOGGER.success(f'Saved Image info for: {location}')
                quit_driver(driver)  # BUGFIX: driver leaked on the success path
                return True
            else:
                failed_img = Image.open('images_to_filter/check.jpg')
                failed_img.save(
                    f'images_to_filter/discarded/{get_last_discarded()}.jpg')
                LOGGER.error(
                    f'Cannot save image. It is now in images_to_filter/discared/ '
                )
                count -= 1
        except Exception:
            LOGGER.error(
                f'Could not save the info of the image for: {location}')
            count -= 1
    quit_driver(driver)
    return False  # BUGFIX: explicit failure result instead of implicit None
def find_all_ireland():
    """Parse the Irish advisory site plus the Wikipedia visa table, write
    ./advisory-ie.json and save everything into the database."""
    LOGGER.info("Begin parsing and saving for Ireland...")
    my_driver = create_driver()
    all_url = find_all_url(my_driver)
    data = find_all_iso(all_url)
    LOGGER.info(
        'Parsing visa requirements for all countries for the Ireland advisory')
    visas = {}  # BUGFIX: defined up-front so a parsing failure cannot NameError below
    try:
        wiki_visa_ob = wiki_visa_parser(
            "https://en.wikipedia.org/wiki/Visa_requirements_for_Irish_citizens",
            my_driver)
        visas = wiki_visa_ob.visa_parser_table()
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while getting the visa requirements for Ireland advisory because of the following error: {error_msg}'
        )
    for country in data:
        c = data[country]
        url = c['href']
        my_driver.implicitly_wait(5)
        my_driver.get(url)
        soup = BeautifulSoup(my_driver.page_source, 'lxml')
        c['visa-info'] = get_one_info(url, 'visa/passport', my_driver, soup)
        c['advisory-text'] = get_one_advisory(url, my_driver, soup)
        c['name'] = country
        if c['visa-info'] == '':
            c['visa-info'] = get_one_info(url, 'Entry requirements', my_driver, soup)
        iso = c['country-iso']
        # handling some exceptions, had to do research
        if iso == 'AI':
            c['visa-info'] = 'Visa not required for 3 months'
        elif iso == 'BM':
            c['visa-info'] = 'Visa not required for 21 days (extendable)'
        elif iso == 'MQ':
            iso = 'FR'
        elif iso == 'MS':
            c['visa-info'] = 'Visa not required for 6 months'
        elif iso == 'RE':
            iso = 'FR'
        else:
            try:
                c['visa-info'] = visas[country].get(
                    'visa') + "<br>" + c['visa-info']
            except Exception as error_msg:
                # BUGFIX: removed stray debug print; the warning log suffices
                LOGGER.warning(f'Error message: {error_msg}')
    # dump the data into js to be deleted later
    quit_driver(my_driver)
    with open('./advisory-ie.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)
def save_to_new_zealand():
    """Parse the NZ advisory site and Wikipedia visa data, write
    ./advisory-nz.json and save it into the database."""
    LOGGER.info("Begin parsing and saving for New Zealand table...")
    driver = create_driver()
    data = {}  # Used to store all the parsed data of each country
    url = get_url_of_countries_nz(driver)  # this function creates its own driver -- to change
    LOGGER.info('Retrieving visa requirements for New Zealand advisory')
    visas = {}  # BUGFIX: defined up-front so a parsing failure cannot NameError below
    try:
        wiki_visa_url = "https://en.wikipedia.org/wiki/Visa_requirements_for_New_Zealand_citizens"
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()  # visa info of each country
        LOGGER.success('Succesfully retrieved visa requirements of all countries for New Zealand advisory')
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving visa requirement for New Zealand adviosry because of the following error: {error_msg}')
    counter_country = 0
    for country in url:  # iterate the urls to retrieve advisory information
        driver.implicitly_wait(5)
        name = country
        href = url[country].get("href")
        link = "https://safetravel.govt.nz/{}".format(href, sep='')
        advisory = parse_a_country_advisory(link, driver)
        # Pop the matching visa entry so the remaining lookup set keeps shrinking.
        visa_text = ""
        if country in visas:
            visa_text = visas[country].get('visa')
            del visas[country]
        country_iso = "na"  # real ISO codes are filled in by find_all_iso below
        data[name] = {'country-iso': country_iso, 'name': name,
                      'advisory-text': advisory, 'visa-info': visa_text}
        # Recycle the driver every 50 countries.
        # BUGFIX: skip the pointless recycle at counter == 0.
        if counter_country and counter_country % 50 == 0:
            quit_driver(driver)
            driver = create_driver()
        counter_country += 1
    quit_driver(driver)  # BUGFIX: the last driver instance was never released
    data = find_all_iso(data)  # Sets iso for each country
    with open('./advisory-nz.json', 'w') as outfile:
        json.dump(data, outfile)
    save_into_db(data)
def translate(iso_language):
    """Translate the module-level PHRASES into each language of
    *iso_language* via google translate and store them into DB.

    Parses at most 10 phrase translations per call (to track errors more
    easily, per the original design).
    """
    count = 0
    driver = create_driver()
    try:
        for lg in iso_language:
            for p in PHRASES:
                iso = iso_language[lg]
                p_edit = p.replace(" ", "%20")
                url = 'https://translate.google.com/?sl=en&tl=' + iso + '&text=' + p_edit
                try:
                    # BUGFIX: reuse the single driver; the original created a
                    # brand-new driver per phrase and never quit the old ones.
                    driver.get(url)
                    soup = BeautifulSoup(driver.page_source, 'lxml')
                except Exception:
                    LOGGER.error(f'Could not parse {lg}')
                    continue
                try:
                    translation = soup.find('span', {
                        'class': 'tlid-translation translation'
                    }).text
                    pronunciation = soup.find_all(
                        'div', {
                            'class': 'tlid-transliteration-content transliteration-content full'
                        })[1].text
                except Exception:
                    LOGGER.info(
                        f'Could not find data for {lg}; will ne replace by -')
                    translation = "-"
                    pronunciation = "-"
                DB.insert('phrases', iso, lg, p, translation, pronunciation)
                count += 1
                if count == 10:
                    return  # driver released by the finally block
    finally:
        quit_driver(driver)
def save_to_UK():
    """Parse UK advisories plus Wikipedia visa data, save them into the GB
    table and dump the rows to ./advisory-uk.json."""
    LOGGER.info("Begin parsing and saving for United Kingdom table...")
    driver = create_driver()
    LOGGER.info('Parsing the visa requirements of all countries for United Kingdom advisory')
    # BUGFIX: both defined up-front so a failure above cannot NameError below
    visas = {}
    data = {}
    try:
        wiki_visa_url = "https://en.wikipedia.org/wiki/Visa_requirements_for_British_citizens"
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()
        data = parse_all_countries_advisory()
        LOGGER.success('Successfully parsed the visa requirements of all countries for United Kingdom advisory')
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving the visa reuirements of all countries for United Kingdom advisory because of the following error: {error_msg}')
    info = {}
    array_info = []
    # create an sqlite_advisory object
    db = Database("countries.sqlite")
    db.drop_table("GB")
    db.add_table("GB",
                 country_iso="text",
                 name="text",
                 advisory_text="text",
                 visa_info="text")
    LOGGER.info('Saving countries informations into the UK table')
    try:
        for country in visas:
            iso = find_iso_of_country(country)
            if iso != "":
                try:
                    name = country
                    # dictionary for the travel advisory is iso{advisory:text}
                    advisory = data[iso].get('advisory')
                    # dictionary for visa info is country{visa:text}
                    visa_info = visas[country].get('visa')
                    info = {
                        "country_iso": iso,
                        "name": name,
                        "advisory": advisory,
                        "visa_info": visa_info
                    }
                    array_info.append(info)
                    # BUGFIX: the success message contained a raw newline that
                    # broke the f-string; emit it on a single line instead.
                    LOGGER.success(f"Saving {name} into the UK table with the following information: {visa_info}. {advisory}")
                    db.insert("GB", iso, name, advisory, visa_info)
                    LOGGER.success(f'{name} sucesfully saved to the database.')
                except KeyError:
                    # BUGFIX: removed stray debug prints; logging suffices
                    LOGGER.warning(f'This country doesn\'t have advisory info: {country}')
                    LOGGER.info(f'Its ISO is {iso}')
        LOGGER.success('All countries have been succesfully saved into the UK table')
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while saving countries into the UK table because of the following: {error_msg}')
    db.close_connection()
    quit_driver(driver)  # BUGFIX: the driver was never released
    with open('./advisory-uk.json', 'w') as outfile:
        json.dump(array_info, outfile)
def parse_one_country_advisory(url, href):
    """Return the first two advisory paragraphs of a gov.uk country page.

    *href* is unused but kept for caller compatibility.
    """
    driver = create_driver()
    advisory = ""
    try:
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        advisory_div = soup.find(
            "div", {"class": "gem-c-govspeak govuk-govspeak direction-ltr"})
        paragraphs = advisory_div.findAll("p")  # hoisted: was queried twice
        # BUGFIX: pages with fewer than two paragraphs no longer IndexError
        advisory = " ".join(p.text for p in paragraphs[:2])
    finally:
        quit_driver(driver)  # BUGFIX: driver is released even when parsing raises
    return advisory
def parse_one_country_advisory(url):
    """Return the first advisory paragraph of a safetravel.govt.nz page.

    The country pages are not uniformly structured, so several extraction
    strategies (mirroring the original nested try/except chain) are tried in
    order; the first one that succeeds wins.
    """
    driver = create_driver()
    # Split over two literals; the runtime value is identical to the original.
    accordion_class = ("acc-content ui-accordion-content ui-corner-bottom "
                       "ui-helper-reset ui-widget-content ui-accordion-content-active")
    advisory_paragraph1 = ""
    try:
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        strategies = (
            lambda: soup.findAll("div", {"class": accordion_class})[1].findAll("span")[0].text.split('\n')[0],
            lambda: soup.findAll("div", {"class": accordion_class})[1].findAll("p")[0].text.split('\n')[0],
            lambda: soup.findAll("div", {"class": accordion_class})[1].text.split('\n')[1],
            lambda: soup.findAll("div", {"class": "alert-section"})[0].findAll("p")[0].text.split('\n')[0],
            lambda: soup.findAll("div", {"class": "space"})[0].findAll("p")[1].text.split('\n')[0],
        )
        for strategy in strategies:
            try:
                advisory_paragraph1 = strategy()
                break
            except IndexError:
                # BUGFIX: a page matching no strategy no longer raises (and
                # no longer leaks the driver); it yields an empty advisory.
                continue
        advisory_paragraph1 = advisory_paragraph1.lstrip()
        LOGGER.info(advisory_paragraph1)  # BUGFIX: was logging a one-element set
    finally:
        quit_driver(driver)
    return advisory_paragraph1
def parse_all_countries_advisory():
    """Return {country_iso: {"advisory": text}} for every UK-advisory country,
    combining the main advisory page with its safety-and-security page."""
    data = {}
    urls = get_url_of_countries()
    driver = create_driver()
    try:
        for country in urls:
            href = urls[country].get("href")
            link = "https://www.gov.uk{}".format(href, sep='')
            advisory = parse_one_country_advisory(link, href)
            link = "https://www.gov.uk{}/safety-and-security".format(href, sep='')
            additional_advisory_info = parse_additional_advisory_info(link, driver)
            data[country] = {"advisory": advisory + additional_advisory_info}
    finally:
        quit_driver(driver)  # BUGFIX: the driver was never released
    return data
def save_to_australia():
    """Parse the Australian advisory site plus the Wikipedia visa table and
    save the combined records into the database."""
    LOGGER.info("Begin parsing and saving for Australia table...")
    url = get_url_of_countries()  # this function creates its own driver -- to change
    data = {}
    driver = create_driver()
    wiki_visa = {}  # BUGFIX: defined up-front so a parsing failure cannot NameError below
    try:
        LOGGER.info(
            'Parsing visa requirements for all countries for the Australian advisory'
        )
        wiki_visa_url = 'https://en.wikipedia.org/wiki/Visa_requirements_for_Australian_citizens'
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        wiki_visa = wiki_visa_ob.visa_parser_table()
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while retrieving the visa requirements for all countries for the Australian advisory because of following error: {error_msg}'
        )
    for country in url:
        driver.implicitly_wait(5)
        name = country
        href = url[country].get('href')
        advisory_text = url[country].get('advisory-text')
        link = "https://smartraveller.gov.au{}".format(href, sep='')
        additional_advisory = get_additional_advisory(link, driver)
        advisory_text = advisory_text + additional_advisory
        LOGGER.info(f"Begin parsing {name} to insert into AU table")
        visa_info = parse_a_country(link, driver, 'Visas')
        LOGGER.success(
            f"The following information was retrieved for {name}: {visa_info}. {advisory_text}"
        )
        # Fall back to the Wikipedia visa data when the page had none.
        if visa_info == '':
            try:
                visa_info = wiki_visa[name].get('visa') + "<br>" + visa_info
            except Exception:
                LOGGER.warning(f"No visa info for {name}")
        country_iso = "na"  # real ISO codes are filled in by find_all_iso below
        data[name] = {
            'country-iso': country_iso,
            'name': name,
            'advisory-text': advisory_text,
            'visa-info': visa_info
        }
    driver.quit()
    data = find_all_iso(data)
    save_into_db(data)
def get_country_traffic_side():
    """Scrape worldstandards.eu and return a list of
    {country_iso, country_name, traffic_side} dicts, one per country."""
    array_of_country_info = []
    already_parsed = []
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    try:
        # this is the link to the first page
        url = 'https://www.worldstandards.eu/cars/list-of-left-driving-countries/'
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        allRows = soup.find('table').find('tbody').findAll('tr')
        for country_row in allRows:
            # The header row should be discarded
            if country_row.find('th') is not None:
                continue
            cells = country_row.findAll('td')  # hoisted: was queried twice
            country = cells[0].text
            traffic_side = cells[1].text
            bracket_pos = country.find('(')
            # We want to remove the bracket from the country name
            if bracket_pos > -1:
                country = country[0:bracket_pos]
            country_iso = find_iso_of_country(country)
            if country_iso != "":
                if country_iso not in already_parsed:
                    # Only parse the main traffic side of a country
                    traffic_side = "left" if "left" in traffic_side else "right"
                    already_parsed.append(country_iso)
                    array_of_country_info.append({
                        "country_iso": country_iso,
                        "country_name": country,
                        "traffic_side": traffic_side
                    })
                else:
                    # BUGFIX: replaced stray debug print with logging
                    LOGGER.info(
                        f"The main land for this country is parsed already {country}")
        return array_of_country_info
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
def save_to_weather():
    """Parse the average-monthly-temperature table and save it into the DB."""
    # Antigua and Barbuda
    LOGGER.info(f'Beginning parsing for average monthly temperature')
    avg_monthly_temperature = ''
    driver = None
    try:
        driver = create_driver()
        wiki_temperature = wiki_weather_parser(wiki_visa_temperature, driver)
        avg_monthly_temperature = wiki_temperature.visa_parser_table()
        LOGGER.success(
            f'Following data was retrieved: {avg_monthly_temperature}')
        save_into_db('weather', avg_monthly_temperature)
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while parsing for temperature because of the following error: {error_msg}'
        )
    finally:
        # BUGFIX: the driver used to leak whenever parsing raised
        if driver is not None:
            quit_driver(driver)
def get_countries_cocainelaw():
    """Scrape the Wikipedia legal-status-of-cocaine table and return
    {country_iso: info_dict} with possession/sale/transport/cultivation."""
    LOGGER.info("Retrieving information for cocaine")
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    footnote = re.compile(r'\[\d*\]')  # wiki footnote markers, compiled once
    try:
        # this is the link to the first page
        url = 'https://en.wikipedia.org/wiki/Legal_status_of_cocaine'
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table_rows = soup.find('table', {'class': "wikitable"}).find('tbody').find_all('tr')
        arrayCocaineInfo = {}
        for tablerow in table_rows:
            table_columns = tablerow.find_all('td')
            if len(table_columns) > 0:
                country_name = table_columns[0].text
                cocaine_possession = footnote.sub(' ', table_columns[1].text.rstrip())
                cocaine_sale = footnote.sub(' ', table_columns[2].text.rstrip())
                cocaine_transport = footnote.sub(' ', table_columns[3].text.rstrip())
                cocaine_cultivation = footnote.sub(' ', table_columns[4].text.rstrip())
                country_iso = find_iso_of_country(country_name)
                arrayCocaineInfo[country_iso] = {
                    "name": country_name,
                    "iso": country_iso,
                    "cocaine-possession": cocaine_possession,
                    "cocaine-sale": cocaine_sale,
                    "cocaine-transport": cocaine_transport,
                    "cocaine-cultivation": cocaine_cultivation
                }
        return arrayCocaineInfo
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving information for cocaine because of the following error: {error_msg}')
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
def get_url_of_countries():
    """Return {country_name: {"href", "advisory-text"}} from the
    smartraveller country-list page."""
    info = {}
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    try:
        # this is the link to the first page
        url = 'https://smartraveller.gov.au/countries/pages/list.aspx'
        LOGGER.info(
            'Retrieving the URLs for all countries for the Australian advisory'
        )
        # create a new chrome session
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # pattern of the link to the country page that the href should match
        reg = regex.compile(r'\/destinations\/\w+-*\w*\/\w+-*\w*')
        table_rows = soup.find('table').find('tbody').find_all('tr')
        for tr in table_rows:
            cols = [ele.text.strip() for ele in tr.find_all('td')]
            if cols[2] == '':
                cols[2] = 'No advisory from the australian government'
            name = cols[0]
            advisory_text = cols[2]
            a = tr.find('a', attrs={'href': reg})
            if a is not None:
                info[name] = {"href": a['href'], "advisory-text": advisory_text}
                LOGGER.success(f'Retrieved URL for {name}')
        LOGGER.success(
            'Successfully retrieved the URLs for all countries of the Australian advisory'
        )
    except Exception as error_msg:
        # BUGFIX: the original bare except swallowed the actual error message
        LOGGER.error(
            f'An error has occured while retrieving the URLs for all countries for the Australian advisory because of the following error: {error_msg}'
        )
    finally:
        if driver is not None:
            quit_driver(driver)
    return info
def all_unsafe_areas():
    """Collect the regional advice levels ("unsafe areas") for every AU
    country and dump them to unsafe-areas-au.json."""
    url = get_url_of_countries()  # this function creates its own driver -- to change
    data = {}
    driver = create_driver()
    LOGGER.info('Retrieving all unsafe areas')
    try:
        for country in url:
            href = url[country].get('href')
            link = "https://smartraveller.gov.au{}".format(href, sep='')
            unsafe_areas = regional_advice_level(driver, link)
            data[country] = {'unsafe_areas': unsafe_areas}
            LOGGER.info(f'{data[country]}')
    finally:
        driver.quit()  # BUGFIX: released even if a country page raises
    data = find_all_iso(data)
    # saving the data in json file
    with open('unsafe-areas-au.json', 'w') as fp:
        json.dump(data, fp)
def get_all_links():
    """Return {country_name: {"href": ...}} (ISO codes added via
    find_all_iso) for every row of the Canadian advisories table."""
    LOGGER.info('Retrieving the URLs for all countries for unsafe areas')
    data = {}
    # home page link
    home = 'https://travel.gc.ca/travelling/advisories'
    driver = create_driver()
    driver.get(home)
    try:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        rows = soup.find('table', attrs={'id': 'reportlist'}).find('tbody').findAll('tr')
        # parse the table, get the link in the <a> tag
        for row in rows:
            col1 = row.find('a')
            name = col1.text
            href = col1['href']
            # the iso function accepts a dictionary with a key as name
            if name == "Canary Islands":
                # BUGFIX: keep the ISO-carrying special record; the original
                # immediately overwrote it with a plain {'href': ...} entry.
                data[name] = {'href': href, 'country-iso': 'CI'}
            else:
                if name == "Saint Vincent & the Grenadines":
                    name = "Saint Vincent and the Grenadines"
                elif name == "Virgin Islands (U.S.)":
                    name = "United States Virgin Islands"
                data[name] = {'href': href}
            LOGGER.success(f'Retrieved the URL for {name}')
        LOGGER.success('Retrieved all the URLs for unsafe areas')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while retrieving the URLs for all countries from the canadian travel website because of the following error: {error_msg}'
        )
    finally:
        quit_driver(driver)
    data = find_all_iso(data)
    return data
def save_to_unsafe_areas():
    """Scrape the regional advisories for every country on travel.gc.ca and
    persist them through save_regional_advisories()."""
    driver = create_driver()
    all_countries = get_all_links()
    data = {}
    try:
        for country in all_countries:
            href = all_countries[country]['href']
            url = "https://travel.gc.ca" + href
            regional_advisory = get_regional_advisories(url, driver)
            data[country] = {'unsafe_areas': regional_advisory}
    finally:
        quit_driver(driver)  # BUGFIX: released even if scraping raises
    # canada special case
    data['Canada'] = {
        'unsafe_areas':
        'There is no regional advisory, take security precautions based on the general advisory for this country.'
    }
    data = find_all_iso(data)
    save_regional_advisories(data)
def get_name_and_advisory_of_countries():
    """Return {country_name: advisory_html} from travel.state.gov, appending
    each country's additional advisory info after a </br> separator."""
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    info = {}
    try:
        # this is the link to the first page
        url = 'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories.html/'
        LOGGER.info("Retrieving URL of all countries for United States")
        # set up the headless chrome driver
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        table_rows = soup.find('table').find('tbody').find_all('tr')
        counter = 0
        for tr in table_rows:
            if counter != 0:  # the first row is a header row
                cols = tr.find_all('td')
                # per-country page url, needed for additional advisory info
                href = cols[0].find('a').get('href')
                link = "https://travel.state.gov/{}".format(href, sep='')
                cols = [ele.text.strip() for ele in cols]
                # presumably strips a fixed 16-char "Travel Advisory"-style
                # suffix from the cell text — TODO confirm against the site
                nameLength = len(cols[0]) - 16
                name = cols[0][0:nameLength]
                if name != 'W':
                    advisory = cols[1]
                    advisory += '</br>' + parse_a_country_additional_advisory_info(
                        link, driver)
                    info[name] = advisory
            counter += 1
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
    return info
def get_countries_canabaislaw():
    """Scrape the Wikipedia legality-of-cannabis table and return
    {country_iso: info_dict} with recreational/medical status."""
    LOGGER.info("Retrieving information for canabais")
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    footnote = re.compile(r'\[\d*\]')  # wiki footnote markers, compiled once
    try:
        # this is the link to the first page
        url = 'https://en.wikipedia.org/wiki/Legality_of_cannabis'
        driver = create_driver()
        driver.get(url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table_rows = soup.find('table', {'class': "wikitable"}).find('tbody').find_all('tr')
        arrayCanabaisInfo = {}
        for tablerow in table_rows:
            table_columns = tablerow.find_all('td')
            if len(table_columns) > 0:
                country_name = table_columns[0].text
                recreational = footnote.sub(' ', table_columns[1].text.rstrip())
                medical = footnote.sub(' ', table_columns[2].text.rstrip())
                country_iso = find_iso_of_country(country_name)
                arrayCanabaisInfo[country_iso] = {
                    "name": country_name,
                    "iso": country_iso,
                    "canabais-recreational": recreational,
                    "canabais-medical": medical
                }
        return arrayCanabaisInfo
    except Exception as error_msg:
        LOGGER.error(f'An error has occured while retrieving information for cannabis because of the followin error: {error_msg}')
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
def translateTest():
    """Smoke-test the google-translate scraping: translate 'thank you'
    en -> fr and return the translation, or '-' when unavailable."""
    # We are parsing the sentences from google translate
    url = 'https://translate.google.com/?sl=en&tl=#view=home&op=translate&sl=en&tl=fr&text=thank%20you'
    driver = None
    soup = None  # BUGFIX: defined so a failed connection cannot NameError below
    try:
        driver = create_driver()
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    except Exception:
        LOGGER.error("Could not connet to google translate.")
    try:
        translation = soup.find('span', {
            'class': 'tlid-translation translation'
        }).text
    except Exception:
        LOGGER.info(
            "Data is missing for Thank You, 'en' to 'fr' and will be replace by '-'"
        )
        translation = "-"
    finally:
        # BUGFIX: the driver used to leak on every call
        if driver is not None:
            quit_driver(driver)
    return translation
def get_url_of_countries():
    """Return {country_iso: {"href": url}} for every country listed on the
    vaccine site (module-level ``vaccine_url``)."""
    info = {}
    driver = None  # BUGFIX: guard so the finally block cannot NameError
    LOGGER.info('Retrieving URL of all countries for Vaccines table')
    try:
        # this is the link to the first page
        driver = create_driver()
        driver.get(vaccine_url)
        # Selenium hands the page source to Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # each letter of the alphabet has its own bulleted country list
        countries_per_letter_array = soup.find_all("ul", {"class": "list-bullet"})
        for countries_per_letter in countries_per_letter_array:
            # retrieving links for all countries of this letter
            for country in countries_per_letter.find_all('a'):
                country_name = country.text
                country_iso = find_iso_of_country(country_name)
                # Countries that don't have an iso are not official countries
                if country_iso != "":
                    href = country['href']
                    info[country_iso] = {"href": href}
                    LOGGER.info(f' Retrieving URL of {country_name}, {href}')
    except Exception as error_msg:
        LOGGER.error(
            f'Could not retrieve URLs of countries because of the following error: {error_msg}'
        )
    finally:
        if driver is not None:
            driver.close()
            driver.quit()
    return info
def get_additional_advisory_info_url():
    """Return {iso: {country_name, advisory_text}} from the Canadian
    advisories table (rows with class 'gradeX')."""
    url = 'https://travel.gc.ca/travelling/advisories'
    # set up the headless chrome driver
    driver = create_driver()
    additional_advisory = {}
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        table_rows = soup.find('table').find('tbody').find_all(
            'tr', attrs={'class': 'gradeX'})
        for row in table_rows:
            cols = row.find_all('td')
            country = cols[0].text
            iso = find_iso_of_country(country)
            advisory = cols[2].text
            additional_advisory[iso] = {
                'country_name': country,
                'advisory_text': advisory
            }
    finally:
        quit_driver(driver)  # BUGFIX: released even if parsing raises
    return additional_advisory
def save_to_SG():
    """Parse Singapore advisories plus Wikipedia visa data and save them into
    the SG table and ./advisory-sg.json."""
    LOGGER.info(f'Saving Singapore into the databse')
    driver = create_driver()
    LOGGER.info(
        'Parsing visa requirments for all countries into the Singapore table')
    visas = {}  # BUGFIX: defined up-front so a parsing failure cannot NameError below
    try:
        wiki_visa_url = wiki_visa_url_SG
        wiki_visa_ob = wiki_visa_parser(wiki_visa_url, driver)
        visas = wiki_visa_ob.visa_parser_table()
        LOGGER.success(
            'Visa requirements have been succesfully parsed for the Singapore table'
        )
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured whilse parsing for visa requirements because of the following error: {error_msg}'
        )
    advisories = parse_all_countries_advisories()
    array_info = []
    # create an sqlite_advisory object
    db = Database("countries.sqlite")
    db.drop_table("SG")
    db.add_table("SG",
                 country_iso="text",
                 name="text",
                 advisory_text="text",
                 visa_info="text")
    array_info = save_info(db, visas, advisories, array_info)
    db.close_connection()
    LOGGER.success(f'Singapore was sucesfully saved to the database')
    quit_driver(driver)
    with open('./advisory-sg.json', 'w') as outfile:
        json.dump(array_info, outfile)
def save_to_central_america():
    """Parse Mexico plus the Wikipedia visa tables for BZ/DM/DO/PA and save
    each dataset into the database."""
    LOGGER.info("Begin parsing and saving for Central America...")
    # create driver
    driver = create_driver()
    # Mexico
    data_MX = mexico_all_links(driver)
    LOGGER.info("Saving Mexico to Central America")
    try:
        save_into_db_MX('MX', data_MX)
        LOGGER.success("MX successfully saved into the databse")
    except Exception as error_msg:
        LOGGER.error(
            f'MX was not successfully saved into the database because of the following error: {error_msg}'
        )
    # BUGFIX: initialised up-front so a failed parse cannot NameError at the
    # save_into_db calls at the bottom.
    visa_BZ = {}
    visa_DM = {}
    visa_DO = {}
    visa_PA = {}
    # create obj driver and set belize as first url
    driver.close()  # BUGFIX: the Mexico driver was never closed before re-creating
    driver = create_driver()
    LOGGER.info(f'Beginning parsing for Belize')
    try:
        wiki_visa = wiki_visa_parser(wiki_visa_url_BZ, driver)
        visa_BZ = wiki_visa.visa_parser_table()
        visa_BZ = replace_key_by_iso(visa_BZ)
        LOGGER.success(f'Following data was retrieved: {visa_BZ}')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while parsing for Belize because of the following error: {error_msg}'
        )
    # Dominica
    driver.close()
    driver = create_driver()
    LOGGER.info(f'Beginning parsing for Dominica')
    try:
        wiki_visa = wiki_visa_parser(wiki_visa_url_DM, driver)
        visa_DM = wiki_visa.visa_parser_table()
        visa_DM = replace_key_by_iso(visa_DM)
        LOGGER.success(f'Following data was retrieved: {visa_DM}')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while parsing for Dominica because of the following error: {error_msg}'
        )
    # Dominican Republic
    driver.close()
    driver = create_driver()
    LOGGER.info(f'Beginning parsing for Dominican Republic')
    try:
        wiki_visa = wiki_visa_parser(wiki_visa_url_DO, driver)
        visa_DO = wiki_visa.visa_parser_table()
        visa_DO = replace_key_by_iso(visa_DO)
        LOGGER.success(f'Following data was retrieved: {visa_DO}')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while parsing for Dominican Republic because of the following error" {error_msg}'
        )
    # Panama
    driver.close()
    driver = create_driver()
    LOGGER.info(f'Beginning parsing for Panama')
    try:
        wiki_visa = wiki_visa_parser(wiki_visa_url_PA, driver)
        visa_PA = wiki_visa.visa_parser_table()
        visa_PA = replace_key_by_iso(visa_PA)
        LOGGER.success(f'Following data was retrieved: {visa_PA}')
    except Exception as error_msg:
        LOGGER.error(
            f'An error has occured while parsing for Panama because of the following error: {error_msg}'
        )
    driver.quit()
    # save the data into the DB
    save_into_db("BZ", visa_BZ)
    save_into_db("DM", visa_DM)
    save_into_db("DO", visa_DO)
    save_into_db("PA", visa_PA)
# NOTE(review): this call appears to be the tail of a function defined above
# this chunk (its body is not visible here) — confirm its indentation and
# enclosing scope against the full file.
save_into_db(data)


def all_unsafe_areas():
    """Collect the regional advice levels ("unsafe areas") for every AU
    country and dump them to unsafe-areas-au.json."""
    url = get_url_of_countries()  # this function creates its own driver -- to change
    data = {}
    driver = create_driver()
    LOGGER.info('Retrieving all unsafe areas')
    try:
        for country in url:
            href = url[country].get('href')
            link = "https://smartraveller.gov.au{}".format(href, sep='')
            unsafe_areas = regional_advice_level(driver, link)
            data[country] = {'unsafe_areas': unsafe_areas}
            LOGGER.info(f'{data[country]}')
    finally:
        driver.quit()  # BUGFIX: released even if a country page raises
    data = find_all_iso(data)
    # saving the data in json file
    with open('unsafe-areas-au.json', 'w') as fp:
        json.dump(data, fp)


# save_to_australia()
driver = create_driver()
data = regional_advice_level(
    driver, "https://www.smartraveller.gov.au/destinations/africa/mali")
quit_driver(driver)
#save_to_australia()