import sys, os, re
sys.path.append("lib")
import utils

import wikipedia
from bs4 import BeautifulSoup
import tldextract

# Load our airlines...
our_airlines = utils.read_json_lines_file('data/our_airlines.jsonl')

# Build a new list that includes wikipedia data
with_url = []
for airline in our_airlines:
    # Get the wikipedia page for the airline name
    wikipage = wikipedia.page(airline['Name'])

    # Get the summary
    summary = wikipage.summary
    airline['summary'] = summary

    # Get the HTML of the page
    page = BeautifulSoup(wikipage.html(), 'html.parser')

    # Task: get the logo from the right 'vcard' column
    # 1) Get the vcard table
    vcard_table = page.find_all('table', class_='vcard')[0]
    # 2) The logo is always the first image inside this table
    first_image = vcard_table.find_all('img')[0]
    # 3) Set the url to the image
    logo_url = 'http:' + first_image.get('src')
    airline['logo_url'] = logo_url

    # Keep the enriched record
    with_url.append(airline)
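# All of these scripts lean on utils.read_json_lines_file() from lib/utils.py,
# which is not shown here. A minimal sketch of what such a helper could look
# like, assuming one JSON object per line (the actual lib/utils.py may differ):
import json

def read_json_lines_file(path):
    """Read a JSON Lines (.jsonl) file into a list of dicts."""
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines
                records.append(json.loads(line))
    return records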
import sys, os, re
sys.path.append("lib")
import utils

import numpy as np
import sklearn
import iso8601
import datetime
print("Imports loaded...")

# Load and check the size of our training data. May take a minute.
print("Original JSON file size: {:,} Bytes".format(
    os.path.getsize("../data/simple_flight_delay_features.jsonl")
))
training_data = utils.read_json_lines_file(
    '../data/simple_flight_delay_features.jsonl'
)
print("Training items: {:,}".format(len(training_data)))  # 5,714,008
print("Data loaded...")

# Inspect a record before we alter it
print("Size of training data in RAM: {:,} Bytes".format(
    sys.getsizeof(training_data)
))  # 50MB
print(training_data[0])

# # Sample down our training data at first...
# sampled_training_data = training_data  # np.random.choice(training_data, 1000000)
# print("Sampled items: {:,}".format(len(sampled_training_data)))
# print("Data sampled...")

# Separate our results from the rest of the data, vectorize and size up
results = [record['ArrDelay'] for record in training_data]
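# If you do want to start with a subset, one option (not from the listing
# above, just a hedged alternative) is random.sample from the standard library,
# which avoids turning millions of dicts into a NumPy object array:
import random

def sample_records(records, n, seed=1337):
    """Return a reproducible random sample of up to n records."""
    random.seed(seed)
    return random.sample(records, min(n, len(records)))

# Example: sampled_training_data = sample_records(training_data, 1000000)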
import sys, os, re
sys.path.append("lib")
import utils

import numpy as np
import sklearn
import iso8601
import datetime
print("Imports loaded...")

# Load and check the size of our training data. May take a minute.
print("Original JSON file size: {:,} Bytes".format(
    os.path.getsize("data/simple_flight_delay_features.jsonl")
))
training_data = utils.read_json_lines_file('data/simple_flight_delay_features.jsonl')
print("Training items: {:,}".format(len(training_data)))  # 5,714,008
print("Data loaded...")

# Inspect a record before we alter it
print("Size of training data in RAM: {:,} Bytes".format(
    sys.getsizeof(training_data)
))  # 50MB
print(training_data[0])

# Sample down our training data at first...
sampled_training_data = training_data  # np.random.choice(training_data, 1000000)
print("Sampled items: {:,}".format(len(sampled_training_data)))
print("Data sampled...")

# Separate our results from the rest of the data, vectorize and size up
results = [record['ArrDelay'] for record in sampled_training_data]
results_vector = np.array(results)
print("Size of results vector: {:,} Bytes".format(
    sys.getsizeof(results_vector)
))  # 45,712,160 Bytes
print("Results vectorized...")
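# The target (ArrDelay) is now a NumPy vector, but the feature dicts themselves
# still need a numeric encoding before scikit-learn can train on them. A minimal
# sketch of that step using DictVectorizer -- an assumed continuation, not part
# of the listing above; note the target is dropped so it cannot leak in:
from sklearn.feature_extraction import DictVectorizer

feature_dicts = []
for record in sampled_training_data:
    features = dict(record)         # copy so the original record is untouched
    features.pop('ArrDelay', None)  # remove the target from the features
    feature_dicts.append(features)

vectorizer = DictVectorizer(sparse=True)
training_matrix = vectorizer.fit_transform(feature_dicts)
print("Feature matrix dimensions: {}".format(training_matrix.shape))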
import sys, os, re
import time
sys.path.append("lib")
import utils

import requests
from bs4 import BeautifulSoup

tail_number_records = utils.read_json_lines_file('data/tail_numbers.jsonl')
aircraft_records = []

# Loop through the tail numbers, fetching each FAA registration page
for tail_number_record in tail_number_records:
    time.sleep(0.1)  # essential to sleep FIRST in loop or you will flood sites

    # Parameterize the URL with the tail number
    BASE_URL = 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt={}'
    tail_number = tail_number_record['TailNum']
    url = BASE_URL.format(tail_number)

    # Fetch the page, parse the HTML
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')

    # The table structure is constant for all pages that contain data
    try:
        aircraft_description = soup.find_all('table')[4]
        craft_tds = aircraft_description.find_all('td')
    except IndexError:
        # Pages without registration data lack the expected table; skip them
        continue
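# The script above accumulates scraped records into aircraft_records; writing
# them back out as JSON Lines needs a counterpart to the read helper. A sketch
# of such a writer -- a hypothetical helper, not necessarily what lib/utils.py
# actually provides:
import json

def write_json_lines_file(records, path):
    """Write a list of dicts to a JSON Lines (.jsonl) file, one object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')

# Example (hypothetical output path):
# write_json_lines_file(aircraft_records, 'data/faa_aircraft.jsonl')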