import sys, os, re
sys.path.append("lib")
import utils

import wikipedia
from bs4 import BeautifulSoup
import tldextract

# Load our airlines...
our_airlines = utils.read_json_lines_file('data/our_airlines.jsonl')

# Build a new list that includes wikipedia data
with_url = []
for airline in our_airlines:
  # Get the wikipedia page for the airline name
  wikipage = wikipedia.page(airline['Name'])

  # Get the summary
  summary = wikipage.summary
  airline['summary'] = summary

  # Get the HTML of the page
  page = BeautifulSoup(wikipage.html(), 'html.parser')

  # Task: get the logo from the right-hand 'vcard' infobox
  # 1) Get the vcard table
  vcard_table = page.find_all('table', class_='vcard')[0]
  # 2) The logo is always the first image inside this table
  first_image = vcard_table.find_all('img')[0]
  # 3) Build an absolute URL to the image (the src is protocol-relative)
  logo_url = 'http:' + first_image.get('src')

  # 4) Store the logo URL and keep the enriched record
  airline['logo_url'] = logo_url
  with_url.append(airline)
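
# Both this script and the ones below lean on a small local helper,
# lib/utils.py's read_json_lines_file(). Its implementation is not shown in
# these snippets; the sketch below is one way such a helper could look
# (an assumption, not the project's actual code).
import codecs, json

def read_json_lines_file(path):
  """Read a JSON Lines file (one JSON document per line) into a list of dicts."""
  records = []
  with codecs.open(path, 'r', 'utf-8') as f:
    for line in f:
      line = line.strip()
      if line:  # skip blank lines
        records.append(json.loads(line))
  return records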
Example #2

import sys, os, re
sys.path.append("lib")
import utils

import numpy as np
import sklearn
import iso8601
import datetime
print("Imports loaded...")

# Load and check the size of our training data. May take a minute.
print("Original JSON file size: {:,} Bytes".format(
    os.path.getsize("../data/simple_flight_delay_features.jsonl")))
training_data = utils.read_json_lines_file(
    '../data/simple_flight_delay_features.jsonl')
print("Training items: {:,}".format(len(training_data)))  # 5,714,008
print("Data loaded...")

# Inspect a record before we alter the data
print("Size of training data in RAM: {:,} Bytes".format(
    sys.getsizeof(training_data)))  # 50MB
print(training_data[0])

# # Sample down our training data at first...
# sampled_training_data = training_data#np.random.choice(training_data, 1000000)
# print("Sampled items: {:,} Bytes".format(len(training_data)))
# print("Data sampled...")

# Separate our results from the rest of the data, vectorize and size up
results = [record['ArrDelay'] for record in training_data]
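
# The script imports iso8601 and datetime but this excerpt never uses them;
# presumably the ISO 8601 date strings in each record get parsed later on.
# A minimal sketch, assuming a record carries a field like 'FlightDate'
# (the field name is an assumption):
example_record = training_data[0]
if 'FlightDate' in example_record:
    flight_date = iso8601.parse_date(example_record['FlightDate'])
    print("Parsed FlightDate: {}".format(flight_date))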
Example #3

import sys, os, re
sys.path.append("lib")
import utils

import numpy as np
import sklearn
import iso8601
import datetime
print("Imports loaded...")

# Load and check the size of our training data. May take a minute.
print("Original JSON file size: {:,} Bytes".format(os.path.getsize("data/simple_flight_delay_features.jsonl")))
training_data = utils.read_json_lines_file('data/simple_flight_delay_features.jsonl')
print("Training items: {:,}".format(len(training_data))) # 5,714,008
print("Data loaded...")

# Inspect a record before we alter the data
print("Size of training data in RAM: {:,} Bytes".format(sys.getsizeof(training_data))) # 50MB
print(training_data[0])

# Sample down our training data at first...
sampled_training_data = training_data  # np.random.choice(training_data, 1000000)
print("Sampled items: {:,}".format(len(sampled_training_data)))
print("Data sampled...")

# Separate our results from the rest of the data, vectorize and size up
results = [record['ArrDelay'] for record in sampled_training_data]
results_vector = np.array(results)
print("Size of results vector: {:,} Bytes".format(sys.getsizeof(results_vector)))  # 45,712,160 Bytes
print("Results vectorized...")
Example #4

import sys, os, re
import time

sys.path.append("lib")
import utils

import requests
from bs4 import BeautifulSoup

tail_number_records = utils.read_json_lines_file('data/tail_numbers.jsonl')

aircraft_records = []
# Loop through the tail numbers, fetching
for tail_number_record in tail_number_records:
  time.sleep(0.1) # essential to sleep FIRST in the loop or you will flood the site
  
  # Parameterize the URL with the tail number
  BASE_URL = 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt={}'
  tail_number = tail_number_record['TailNum']
  url = BASE_URL.format(tail_number)

  # Fetch the page, parse the HTML
  r = requests.get(url)
  
  html = r.text
  soup = BeautifulSoup(html, 'html.parser')
  
  # The table structure is constant for all pages that contain data
  try:
    aircraft_description = soup.find_all('table')[4]
    craft_tds = aircraft_description.find_all('td')
  except IndexError:
    # Pages with no registration data don't contain this table; skip them
    continue
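
  # What happens to craft_tds next depends on the FAA page layout, which this
  # excerpt doesn't show. As a hedged sketch, keep the raw cell text for each
  # tail number (the 'fields' key is an illustrative assumption):
  aircraft_records.append({
    'TailNum': tail_number,
    'fields': [td.text.strip() for td in craft_tds]
  })

# After the loop, the collected records could be written back out as JSON
# Lines for later use (the output path is an assumption):
import codecs, json
with codecs.open('data/our_aircraft.jsonl', 'w', 'utf-8') as f:
  for record in aircraft_records:
    f.write(json.dumps(record) + '\n')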