def get_nearby_places_for_site(national_site):
    coordinates = get_site_coordinates(national_site)
    if coordinates[0] == 0:  # (0, 0) is the sentinel returned when the lookup fails
        print(">>> UNABLE TO RETRIEVE NEARBY PLACES")
        return None
    latitude = str(coordinates[0])
    longitude = str(coordinates[1])
    location = latitude + "," + longitude
    site = "GOOGLE"
    topic = "nearby " + national_site
    cache = Cache(cache_file)
    base2 = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
    params_d2 = {}
    params_d2["key"] = google_places_key
    params_d2["location"] = location
    params_d2["radius"] = 10000
    UID = create_id(site, topic)
    nearby_response = cache.get(UID)
    if nearby_response is None:
        nearby_response = requests.get(base2, params_d2).text
        cache.set(UID, nearby_response)
    responses = json.loads(nearby_response)
    responses = responses["results"]
    NearbyList = []
    for i in responses:
        name = i["name"]
        latitude = i["geometry"]["location"]["lat"]
        longitude = i["geometry"]["location"]["lng"]
        place = NearbyPlace(name, latitude, longitude)
        NearbyList.append(place)
    return NearbyList
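# The Cache class imported from alternate_advanced_caching is used in every
# snippet here but never shown. A minimal sketch, assuming the interface these
# snippets rely on (get(identifier) -> value or None, set(identifier, value,
# optional expiry in days)), might look like this; the real implementation may
# differ.
import json
import os
from datetime import datetime, timedelta


class Cache:
    def __init__(self, filename):
        # Load any previously cached entries from the JSON file.
        self.filename = filename
        if os.path.isfile(filename):
            with open(filename) as f:
                self.store = json.load(f)
        else:
            self.store = {}

    def get(self, identifier):
        # Return the cached value, or None if it is missing or expired.
        entry = self.store.get(identifier)
        if entry is None:
            return None
        if entry["expires"] is not None and entry["expires"] < datetime.now().isoformat():
            return None
        return entry["value"]

    def set(self, identifier, value, expire_in_days=None):
        # Store the value with an optional expiry and write the file back out.
        expires = None
        if expire_in_days is not None:
            expires = (datetime.now() + timedelta(days=expire_in_days)).isoformat()
        self.store[identifier] = {"value": value, "expires": expires}
        with open(self.filename, "w") as f:
            json.dump(self.store, f)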
Example #2
def get_nearby_places_for_site(national_site):
    cache_file = "part2_nearbysearch.json"
    cache = Cache(cache_file)

    base = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
    params_diction = {}
    params_diction["key"] = google_places_key
    params_diction["location"] = get_location_for_site(national_site)
    params_diction["radius"] = 10000

    identifier = base + params_diction["key"] + params_diction[
        "location"] + str(params_diction["radius"])

    response = cache.get(identifier)
    while response is None:
        response = json.loads(requests.get(base, params_diction).text)
        cache.set(identifier, response, 10)

    nearby_result_list = response["results"]

    nearby_list = []

    for nearby in nearby_result_list:
        name = nearby["name"]
        lat = nearby["geometry"]["location"]["lat"]
        lng = nearby["geometry"]["location"]["lng"]
        nearby_list.append(NearbyPlace(name, lat, lng))

    return nearby_list
def get_site_coordinates(national_site):
    site = "GOOGLE"
    topic = national_site
    cache = Cache(cache_file)
    base1 = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json?"
    params_d = {}
    params_d["key"] = google_places_key
    params_d["input"] = national_site
    params_d["inputtype"] = "textquery"
    params_d['fields'] = 'geometry,formatted_address'
    # params_d["locationbias"] = "point:lat,lng"
    UID = create_id(site, topic)
    get_data = cache.get(UID)
    if get_data is None:
        get_data = requests.get(base1, params_d).text
        cache.set(UID, get_data)
    lat = 0
    long = 0
    site_data = json.loads(get_data)
    try:
        place = site_data['candidates'][0]
        latitude = place['geometry']['location']['lat']
        longitude = place['geometry']['location']['lng']
        site_coordinates = latitude, longitude
    except (IndexError, KeyError):  # no candidates or missing geometry in the response
        site_coordinates = lat, long
        print(
            "Sorry! There was an error retrieving coordinates for {}. We will not be able to list its nearby places or map it."
            .format(national_site))
    return site_coordinates
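# create_id(site, topic) (and create_id_sites(url) in a later snippet) builds
# the cache identifiers used above but is not shown in these excerpts. A
# hypothetical sketch consistent with how it is called:
def create_id(site, topic):
    # Stable string key combining the data source ("NPS", "GOOGLE") and a topic.
    return "{}_{}".format(site.upper(), topic.lower().replace(" ", "_"))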
def website_scraping_and_cache(url):
    cache = Cache("national_sites.json")
    result = cache.get(url)

    if not result:
        result = requests.get(url).text
        cache.set(url, result, 30)

    return BeautifulSoup(result, 'html.parser')
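# Usage sketch for website_scraping_and_cache (illustrative URL; the first
# call hits the network, later calls reuse the cached HTML until it expires):
nps_soup = website_scraping_and_cache("https://www.nps.gov/index.htm")
print(nps_soup.title.text.strip())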
def get_sites_for_state(state_abbr):
    site = "NPS"
    topic = state_abbr
    cache = Cache(cache_file)
    base = project_dictionary[state_abbr]
    UID = create_id(site, topic)
    state_response = cache.get(UID)
    if state_response is None:
        state_response = requests.get(base).text
        cache.set(UID, state_response)
    NationalSiteList = state_process(state_response)
    return NationalSiteList
def get_nearby_places_for_site(national_site):
    nearby_places_list = []  # the result list that stores the nearby places

    lng, lat = get_geolocation_info(national_site)
    national_site.lat = lat
    national_site.lng = lng

    if lng is None or lat is None:
        print("There is no geolocation info for " + str(national_site) + ".")
    else:
        params_dic = {
            "key": google_places_key,
            "location": str(lat) + "," + str(lng),
            "radius": 10000
        }
        unique_identifier = params_unique_combination(nearbysearch_base_url,
                                                      params_dic)
        cache = Cache("nearby_places.json")
        places_json = cache.get(unique_identifier)

        if not places_json:
            result = requests.get(nearbysearch_base_url, params=params_dic)
            places_json = json.loads(result.text)
            cache.set(unique_identifier, places_json, 30)

        try:
            places = places_json["results"]
            for place in places:
                place_class = NearbyPlace(place['name'])
                try:
                    place_class.lat = place['geometry']['location']['lat']
                    place_class.lng = place['geometry']['location']['lng']
                except (KeyError, TypeError):  # place has no geometry information
                    pass
                nearby_places_list.append(place_class)
        except KeyError:  # response has no "results" key
            pass

    return nearby_places_list
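# NearbyPlace is constructed two different ways in these excerpts:
# NearbyPlace(name, lat, lng) earlier and NearbyPlace(name) with lat/lng set
# afterwards here. A hypothetical sketch covering both call styles:
class NearbyPlace:
    def __init__(self, name, lat=None, lng=None):
        self.name = name
        self.lat = lat
        self.lng = lng

    def __str__(self):
        return self.name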
Example #7
def get_location_for_site(national_site):
    cache_file = "part2_textsearch.json"
    cache = Cache(cache_file)

    base = "https://maps.googleapis.com/maps/api/place/textsearch/json?"
    params_diction = {}
    params_diction["query"] = "{},{}".format(national_site.name,
                                             national_site.type)
    params_diction["key"] = google_places_key

    # Cache identifier from the query text only (the API key is left out so it
    # is not written into the cache file).
    identifier = base + params_diction["query"]

    response = cache.get(identifier)
    while response is None:
        response = json.loads(requests.get(base, params_diction).text)
        cache.set(identifier, response, 10)

    try:
        lat = str((response["results"][0]["geometry"]["location"]["lat"]))
        lng = str((response["results"][0]["geometry"]["location"]["lng"]))
        return lat + ',' + lng
    except (IndexError, KeyError):  # no results for this query
        return None
def get_geolocation_info(national_site):
    params_dic = {
        "key": google_places_key,
        "address": national_site.name + " " + national_site.type
    }
    unique_identifier = params_unique_combination(geolocation_base_url,
                                                  params_dic)
    cache = Cache("geolocation_info.json")
    geolocation_json = cache.get(unique_identifier)

    if not geolocation_json:
        result = requests.get(geolocation_base_url, params=params_dic)
        geolocation_json = json.loads(result.text)
        cache.set(unique_identifier, geolocation_json, 30)

    try:
        geolocation = geolocation_json["results"][0]['geometry']['location']
        lng = geolocation['lng']
        lat = geolocation['lat']
    except (IndexError, KeyError):  # geocoding returned no usable result
        lng = None
        lat = None

    return lng, lat
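# geolocation_base_url and nearbysearch_base_url are referenced above but not
# defined in these excerpts. Assuming the standard Google Geocoding and Places
# Nearby Search endpoints, plausible module-level values would be:
geolocation_base_url = "https://maps.googleapis.com/maps/api/geocode/json"
nearbysearch_base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"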
from bs4 import BeautifulSoup
from alternate_advanced_caching import Cache
import requests
from datetime import datetime
import json
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

#################################
# PART 1: SCRAPING DATA FROM TA #
#################################

CACHE_FNAME = "top_destination.json"
url_to_scrape = "https://www.tripadvisor.com/TravelersChoice-Destinations"
cache_html = Cache(CACHE_FNAME)

while cache_html.get(url_to_scrape) is None:
    print("DATA NOT IN CACHE, SCRAPING FROM URL NOW")
    data = requests.get(url_to_scrape)
    html_text = data.text
    cache_html.set(url_to_scrape, html_text, 10)

soup = BeautifulSoup(cache_html.get(url_to_scrape), features="html.parser")
mainnames = soup.find_all("div", class_="mainName")
destination_name_lst = []
for mainname in mainnames:
    name = mainname.find("a").text
    destination_name_lst.append(name)
city_name_lst = []
for items in destination_name_lst:
    city_name_lst.append(items)  # per-destination processing truncated in the source


def build_state_url_dict(index_soup):
    # Hypothetical wrapper (the function around this loop was lost in extraction):
    # map each state abbreviation to its NPS state page URL using the links on
    # the NPS index page.
    project_dictionary = {}
    for t in index_soup.find_all('a'):
        if "state" in t.attrs['href']:
            project_dictionary[t.attrs['href'][7:9]] = "https://www.nps.gov" + t.attrs["href"]
    return project_dictionary


#################################
#    CONFIG & RUN LIST SCRAPE   #
#################################

cache_file = "NPS.json"
site = "NPS"
topic = "states"
cache = Cache(cache_file)
base = "https://www.nps.gov/index.htm"
UID = create_id(site, topic)
response = cache.get(UID)

if response is None:
    response = requests.get(base).text
    cache.set(UID, response)

process(response)

#####################################
## NATIONAL SITE CLASS
#####################################

class NationalSite:
    # The constructor and the rest of the class are omitted in this excerpt;
    # instances carry at least a name and a location dict with "lat"/"lng" keys.

    def __str__(self):
        return self.name

    def lat(self):
        return self.location["lat"]

    def lng(self):
        return self.location["lng"]


lst_url = []

CACHE_FNAME = "sample_cache_national_site.json"
CACHE_FNAME_Google = "google_cache.json"
primary_cache = Cache(CACHE_FNAME)

secondary_cache = Cache(CACHE_FNAME_Google)


def check_cache(lst_url):

    for url in lst_url:
        if primary_cache.get(url) is None:
            data = requests.get(url)
            html_text = data.text
            primary_cache.set(url, html_text, 30)


def get_sites_for_state(state_abbr):
    base_url = str("https://www.nps.gov/" + "state/" + state_abbr +
                   "/index.htm")
    # (the remainder of this function is truncated in the source)
def process(response):
    name_lst = []
    url_lst = []
    site_lst = []

    soup = BeautifulSoup(response, 'html.parser')

    national_site_container = soup.find_all(
        'div', class_='col-md-9 col-sm-9 col-xs-12 table-cell list_left')

    for container in national_site_container:
        # Name
        name = container.h3.text
        name_lst.append(name)
        # print(name)

        # Type
        type = container.h2.text
        # print(type)

        # Description
        process.desc = container.p.text
        # print(desc)

        # URL
        process.url = "https://www.nps.gov" + container.h3.a.get(
            'href') + "index.htm"
        url_lst.append(process.url)
        # print(url)

        # Look at each URL collected so far and scrape that page for an address
        # (the values from the last URL, i.e. the current site, are the ones kept;
        # the cache keeps the repeated requests cheap)
        for urls in url_lst:
            cache_file = "nps_address.json"
            cache_address = Cache(cache_file)

            UID = create_id_sites(urls)
            response2 = cache_address.get(UID)
            if response2 is None:
                response2 = requests.get(urls).text
                cache_address.set(UID, response2, 100)

            soup2 = BeautifulSoup(response2, "html.parser")
            try:
                ## Address Street
                address_street_fndr = soup2.find(
                    attrs={"itemprop": "streetAddress"})
                process.address_street = address_street_fndr.text
                process.address_street = process.address_street.replace(
                    '\n', '')
                # print(process.address_street)

                ## Address City
                address_city_fndr = soup2.find(
                    attrs={"itemprop": "addressLocality"})
                process.address_city = address_city_fndr.text
                # print(process.address_city)

                ## Address State
                address_state_fndr = soup2.find(
                    attrs={"itemprop": "addressRegion"})
                process.address_state = address_state_fndr.text
                # print(process.address_state)

                ## Address ZIP
                address_zip_fndr = soup2.find(attrs={"itemprop": "postalCode"})
                process.address_zip = address_zip_fndr.text
                process.address_zip = process.address_zip.strip()
                # print(process.address_zip)
            except AttributeError:  # address block not found on the page
                # print("No address found for {}".format(urls))
                process.address_street = "Not found"
                process.address_city = "Not found"
                process.address_state = "Not found"
                process.address_zip = "Not found"

        national_sites = NationalSite(
            type, name)  # Create a new NationalSite instance
        site_lst.append(national_sites
                        )  # Append each NationalSite instance to site_lst list
    return site_lst


######################
#  GOOGLE PLACES API #
######################
CACHE_FILE1 = "google_places.json"
CACHE_FILE2 = "google_coordinates.json"
c = Cache(CACHE_FILE1)
c2 = Cache(CACHE_FILE2)


def params_unique_combination(baseurl, params_d, private_keys=["key", "api_key"]):
    # Build a cache identifier from the base URL plus the alphabetized,
    # non-private query parameters, e.g.
    # https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=44.778410,-117.827940&radius=10000
    alphabetized_keys = sorted(params_d.keys())
    res = []
    for k in alphabetized_keys:
        if k not in private_keys:
            res.append("{}={}".format(k, params_d[k]))
    return baseurl + "&".join(res)
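# Example of the identifier params_unique_combination produces (parameter
# values are illustrative; the API key is excluded from the identifier):
sample_params = {"key": "MY_SECRET_KEY",
                 "location": "44.778410,-117.827940",
                 "radius": 10000}
print(params_unique_combination(
    "https://maps.googleapis.com/maps/api/place/nearbysearch/json?",
    sample_params))
# -> https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=44.778410,-117.827940&radius=10000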


def google_coordinates(input,
                       inputtype="textquery",
Example #14
import requests
from alternate_advanced_caching import Cache
from Scrap import *
import emoji
import base64
import os
from Twitter_official import *


Cache_name_twitter = "Twitter_cache.json"
Twitter_cache = Cache(Cache_name_twitter)
lst_twitter_url = []



def init_twitter(client_key, client_secret):

    key_secret = '{}:{}'.format(client_key, client_secret).encode('ascii')
Example #15
def get_sites_for_state(state_abbr):
    cache_file = "part1.json"
    url_to_scrape = "https://www.nps.gov/state/{}/index.htm".format(state_abbr)
    cache = Cache(cache_file)

    while cache.get(url_to_scrape) is None:
        html_text = requests.get(url_to_scrape).text
        cache.set(url_to_scrape, html_text, 10)

    soup = BeautifulSoup(cache.get(url_to_scrape), features='html.parser')
    parks = soup.find(id="list_parks").find_all(class_='clearfix')

    ### Information you should get for each National Site will include the site name, site type, and the physical (or mailing) address.
    national_park_list = []
    for park in parks:
        site_name = park.find('h3').text
        site_type = park.find('h2').text
        site_desc = park.find('p').text

        address_url = park.find_all('a')[2].get('href')

        cache_file = "part1_address.json"
        url_to_scrape = address_url
        cache = Cache(cache_file)

        while cache.get(url_to_scrape) is None:
            html_text = requests.get(url_to_scrape).text
            cache.set(url_to_scrape, html_text, 10)

        soup_add = BeautifulSoup(cache.get(url_to_scrape),
                                 features='html.parser')

        address_street = soup_add.find(itemprop='streetAddress').text
        address_city = soup_add.find(itemprop='addressLocality').text
        address_state = soup_add.find(itemprop='addressRegion').text
        address_zip = soup_add.find(itemprop='postalCode').text

        national_park_list.append(
            NationalSite(site_type, site_name, site_desc, address_street,
                         address_city, address_state, address_zip))
    return national_park_list
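# Usage sketch for get_sites_for_state (the state abbreviation is illustrative;
# assumes NationalSite has a readable __str__):
for national_site in get_sites_for_state("mi"):
    print(national_site)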
Example #16
from bs4 import BeautifulSoup as bs
from alternate_advanced_caching import Cache
from datetime import datetime
import json
import requests

lst_url = []
Cache_name = "Artists_cache.json"
artists_cache = Cache(Cache_name)


def check_cache(lst_url):

    for url in lst_url:
        if artists_cache.get(url) is None:
            data = requests.get(url)
            html_text = data.text
            artists_cache.set(url, html_text, 30)


def get_top_billboard(year, search_term):
    year = int(year)
    search_term = search_term.replace(" ", "-")
    primary_url = str("https://www.billboard.com/charts/year-end/{}/{}".format(
        year, search_term))
    lst_url.append(primary_url)
    check_cache(lst_url)
    soup = bs(artists_cache.get(primary_url), features="html.parser")
    name = [
        i.get_text().replace("\n", "")
        for i in soup.find_all(class_="ye-chart-item__title")