def __init__(self, entity, ok_path, project, wandb_path="./wandb"):
    """Set up an okpy grader and start an anonymous W&B run.

    Args:
        entity: W&B entity (user or team) to log runs under.
        ok_path: Path to the .ok assignment configuration file.
        project: W&B project name.
        wandb_path: Directory where W&B stores its run files.
    """
    self.grader = Notebook(ok_path)
    wandb.init(entity=entity, project=project, dir=wandb_path, anonymous="must")
    # One pass-count slot per autograder test, all starting at zero.
    self.test_map = self.grader.assignment.test_map
    self.pass_dict = {test_name: 0 for test_name in self.test_map}
    self.log()
def test_nb_grade_simple_valid():
    """Parse and run a simple oktest file end to end."""
    test_dir = os.path.dirname(__file__)
    ok_file = os.path.join(test_dir, 'oktests/simple.ok')
    Notebook(ok_file).grade('simple_valid')
def __client__(assignment_type, force_auth):
    """Create an authenticated okpy Notebook client.

    Args:
        assignment_type: Path to the .ok assignment config file.
        force_auth: When true, always run a fresh (forced) authentication.

    Returns:
        An authenticated ``Notebook`` instance.
    """
    ok = Notebook(assignment_type)
    # expanduser("~") honors $HOME when it is set and still resolves a home
    # directory when it is not, whereas _os.environ.get("HOME") returns None
    # in that case and would make _os.path.join raise TypeError.
    refresh_token = _os.path.join(
        _os.path.expanduser("~"), ".config/ok/auth_refresh")
    # Force a full auth when asked, or when no cached refresh token exists;
    # otherwise reuse the cached credentials with an inline prompt.
    if force_auth or not _os.path.exists(refresh_token):
        ok.auth(force=True)
    else:
        ok.auth(inline=True)
    return ok
def push(OK):
    """Serialize the OK assignment config to ok.py, then authenticate and submit."""
    with open('ok.py', 'wt') as config_file:
        json.dump(OK, config_file)
    from client.api.notebook import Notebook
    client = Notebook('ok.py')
    client.auth(inline=True)
    client.submit()
class WandbTrackedOK(object):
    """Wrap an okpy Notebook grader so per-question pass counts are logged to W&B."""

    def __init__(self, entity, path, project):
        """Create the grader, start an anonymous W&B run, and log a zeroed baseline.

        Args:
            entity: W&B entity (user or team) to log runs under.
            path: Path to the .ok assignment configuration file.
            project: W&B project name.
        """
        self.grader = Notebook(path)
        wandb.init(entity=entity, project=project, anonymous="must")
        # One pass-count slot per autograder test, all starting at zero.
        self.test_map = self.grader.assignment.test_map
        self.pass_dict = {k: 0 for k in self.test_map}
        self.log()

    def grade(self, question, *args, **kwargs):
        """Grade one question, record its pass count, and log the update to W&B.

        Returns:
            The grader's result dict (so callers can inspect e.g. "passed");
            previously this method returned None, so the return is additive.
        """
        result = self.grader.grade(question, *args, **kwargs)
        self.pass_dict[question] = result["passed"]
        self.log()
        return result

    def log(self):
        """Send the current per-question passes and their total to W&B."""
        total = sum(self.pass_dict.values())
        wandb.log({"passes": self.pass_dict, "total": total})

    def __del__(self):
        # BUG FIX: this was named __delete__, which is the descriptor-protocol
        # hook (invoked on attribute deletion of a descriptor, never on object
        # teardown). __del__ is the finalizer that actually runs when the
        # instance is garbage collected, closing out the W&B run.
        wandb.join()
# NOTE(review): flattened notebook export. The os.system() call shells out to
# wget/unzip to fetch the tests and the final.ok config. The long commented-out
# get_ipython().run_cell_magic(...) call preserves the original grading cell
# verbatim (it parses okpy results with BeautifulSoup and appends rows to
# grades.csv). The active code afterwards re-runs grading via the local
# `grade` module instead. Kept byte-identical below.
# # # In[ ]: #This just gets the data and preps the environment. os.system( 'wget $"https://github.com/jkuruzovich/final-starter-2019/raw/master/tests.zip" && unzip -o tests.zip ' '&& wget "https://github.com/jkuruzovich/final-starter-2019/blob/master/final.ok"' ) # In[ ]: #get_ipython().run_cell_magic('capture', ' ', '#This capture command supresses output. \n\n#***********\n#Add the manually graded ones\nq4=5\ncomments=""\npoints_per_test=2.5\n\n#***********\n\n\nfrom client.api.notebook import Notebook\nok = Notebook(\'final.ok\')\n_ = ok.auth(inline=False)\nfrom bs4 import BeautifulSoup\nimport pandas as pd\nimport os\n\n#Grade Results\nresults= {q[:-3]:ok.grade(q[:-3]) for q in os.listdir("tests") if q.startswith(\'q\')}\n\n#If running locally with lots of notebooks load the grades. \ndf = pd.DataFrame()\nrow=df.shape[0]\ndf.loc[row,\'student\']=name #This is set in the last.\ndf.loc[row,\'rcsid\']=rcsid #This is set in the last. \ntotal_grade=0\n#This loops through the results\nfor key, val in results.items(): \n df.loc[row,key]=val.grade\n results_key=str(key)+"-failed"\n df.loc[row,key]=val.grade*points_per_test\n #We use beautiful soup to parse the tests. \n soup = BeautifulSoup(str(val.failed_tests), "lxml")\n #There are multiple components, but the expected data seems most valuable. 
\n got = soup.get_text().split(\'\\\\n\')[16:20]\n df.loc[row,results_key]=str(got)\n total_grade+=df.loc[row,key] #total grade\ndf.loc[row, \'q4\']=q4\ntotal_grade+=q4\ndf.loc[row,\'total_grade\']=total_grade\ndf.loc[row,\'comments\']=comments\n\nif not os.path.isfile(\'grades.csv\'):\n df.to_csv(\'grades.csv\', index=False)\nelse: # else it exists so append without writing the header\n df.to_csv(\'grades.csv\', mode=\'a\', header=False,index=False)\n') import grade from client.api.notebook import Notebook ok = Notebook('final.ok') _ = ok.auth(inline=True) name = "final" points_per_test = 2.5 comments = "" grade.grade(name, points_per_test, comments, ok) # In[ ]: #get_ipython().system('cat grades')
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('proj3.ok')

# # Project 3: Predicting Taxi Ride Duration
# ## Due Date: Thursday 5/2/19, 11:59PM
#
# **Collaboration Policy**: you may talk with others about the project, but
# write your solutions individually; if you discuss the assignments with
# others, include their names at the top of your notebook.
# **Collaborators**: *list collaborators here*
#
# ## Score Breakdown
# Question | Points
# --- | ---
# 1a | 2
# 1b | 2
# 1c | 3
# 1d | 2
# 2a | 1
# 2b | 2
# 3a | 2
# Notebook setup: plotting defaults plus the okpy autograder client.
from datascience import *
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

from client.api.notebook import Notebook
ok = Notebook('project01.ok')
_ = ok.auth(inline=True)
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('proj2.ok')

# # Project 2: Spam/Ham Classification
# ## Feature Engineering, Logistic Regression, Cross Validation
# ## Due Date: Sunday 11/24/19, 11:59PM
#
# **Collaboration Policy**: you may talk with others about the project, but
# write your solutions individually; include collaborators' names at the top
# of your notebook.
# **Collaborators**: *list collaborators here*
#
# ## This Assignment
# Build a classifier that distinguishes spam (junk/commercial/bulk) emails
# from ham (non-spam). Graded on model accuracy and written responses.
# Afterwards you should be comfortable with: feature engineering on text,
# using sklearn to process data and fit models, and validating performance
# while minimizing overfitting.
import numpy as np
from datascience import *

# These lines set up graphing capabilities.
import matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from client.api.notebook import Notebook
ok = Notebook('lab04.ok')

# ## 1. Functions and CEO Incomes
# 2015 compensation of CEOs at the 100 largest California companies, from an
# LA Times analysis (http://spreadsheets.latimes.com/california-ceo-compensation/)
# based on SEC filings; dollar amounts are in millions. Two companies have
# two CEOs, so the dataset holds 102 CEOs. The raw data live in
# raw_compensation.csv.

# In[ ]:

raw_compensation = Table.read_table('raw_compensation.csv')
raw_compensation
# You **never** have to use just one line in this project or any others —
# use intermediate variables and multiple lines as much as you would like!
# To get started, load datascience, numpy, plots, and ok.

# In[1]:

from datascience import *
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
from client.api.notebook import Notebook
ok = Notebook('project1.ok')

# Before continuing, select "Save and Checkpoint" from the File menu, then
# run the submit cell below; the result contains a link to verify that the
# assignment was submitted. Only the final pre-deadline submission is
# graded; if you submit the wrong one, flag the correct version on okpy.org.
# There is another submit cell at the end of the assignment.

# In[2]:

_ = ok.submit()

# ## 1. Global Population Growth
# World population reached ~1 billion around 1800, 3 billion around 1960,
# and 7 billion around 2011. The UN Population Division projects continued
# but slowing growth through the 21st century — perhaps 11 billion by 2100 —
# without ruling out more extreme scenarios.
# <a href="http://www.pewresearch.org/fact-tank/2015/06/08/scientists-more-worried-than-public-about-worlds-growing-population/ft_15-06-04_popcount/">
from client.api.notebook import Notebook
from client.api import assignment
from client.utils import auth

# Point the okpy client at the course's self-hosted grading server.
args = assignment.Settings(
    server='clewolffautook21.eastus.cloudapp.azure.com/okpy')
ok = Notebook('./lecture.ok', args)
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('proj1b.ok')

# # Project 1 Part B
# ## Due Date: Monday, Feb 24th, 11:59PM
# First we import the relevant libraries for this project.

# In[2]:

import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
plt.style.use('fivethirtyeight')

# In the following cell we load the cleaned data from Part A of Project 1
# (data files are based on the staff solution).
# In[47]:
# # Lab 1: Introduction to Python
# Welcome to Data Science 8.1X Foundations of Data Science! Weekly labs give
# hands-on practice with computational thinking in Python. Fill in the cells
# provided; run the cell below first to load the autograder tests — passing
# all of them earns full credit for the lab. This lab covers numbers, names,
# expressions, and a brief introduction to tables.

# In[ ]:

import numpy as np
from datascience import *

# These lines load the tests.
from client.api.notebook import Notebook
ok = Notebook('lab01.ok')

# ## 1. Numbers
# Expressions can represent numbers and ways of combining them; `3.2500`
# evaluates to 3.25 (run the cell and see).

# In[66]:

3.2500

# No `print` needed: Jupyter prints the value of a cell's last line
# automatically, but not prior lines.

# In[ ]:

print(2)
3
# NOTE(review): flattened notebook cells (#NEWCELL markers). First a
# commented-out hashtag-checking helper; then the active grading code, which
# opens the okpy client from the cf config dict and grades every tests/q*
# file; then the start of output_tests(), whose body continues beyond this
# chunk and is therefore left untouched. Kept byte-identical below.
#NEWCELL #Checks function. #def check_get_hashtags(file,hashtag,answer): # with open(file) as json_file: # statuses = json.load(json_file) # other_hashtags = get_hashtags(statuses, hashtag) # #print(other_hashtags) # other_hashtags = [s.replace('#', '') for s in other_hashtags] # if other_hashtags==answer: # return True # else: # return False #NEWCELL ok = Notebook(cf['ok_file']) _ = ok.auth(inline=False) results = { q[:-3]: ok.grade(q[:-3]) for q in os.listdir("tests") if q.startswith('q') } #NEWCELL import autograde as ag importlib.reload(ag) def output_tests(cf, results): autograde = {} autograde['github_id'] = cf['github_id'] #This is a selection of variables from config file.
# Welcome to Lab 2!
# Last time we had our first look at Python and Jupyter notebooks, using
# Python only to manipulate numbers. This lab introduces another fundamental
# type of data — text, called a *string* in Python — plus *methods*
# (function-like operations tied to a particular piece of data) and
# *collections* of data, like the numbers 2 through 5 or the words
# "welcome", "to", and "lab".
# Initialize the OK tests to get started.

# In[ ]:

from client.api.notebook import Notebook
ok = Notebook('lab02.ok')

# # 1. Review: The building blocks of Python code
# The two building blocks are *expressions* and *statements*. An
# **expression** is self-contained (it makes sense on a line by itself) and
# usually has a value — e.g. `3` and `5 - 2` both evaluate to 3. One
# important form is the **call expression**, which names a function and then
# describes its arguments; the function returns a value based on them.
#!/usr/bin/env python
# coding: utf-8

# In[2]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('hw5.ok')

# # Homework 5: Predicting Housing Prices
# ## Due Date: 11:59pm Monday, March 30
# ### Collaboration Policy: you may talk with others about the homework, but
# write your solutions individually; list collaborators in the cell below.
# **Collaborators:** *list names here*
#
# ## Introduction
# Iterate through specifying, fitting, and analyzing the performance of a
# model: guided EDA and the reasoning behind modeling decisions, adding a
# new feature, fitting a linear model on a few housing features to predict
# prices, then error analysis and brainstorming improvements. Goals: simple
# feature engineering, sklearn linear models, pandas data pipelines.
#!/usr/bin/env python
# coding: utf-8

# In[310]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('project3.ok')

# # Project 3: Movie Classification (Data 8, fa18)
# Build a classifier that guesses whether a movie is romance or action using
# only word counts from its screenplay. Goals: build a k-nearest-neighbors
# classifier and test a classifier on data.
#
# ### Logistics
# **Deadline**: 11:59pm Friday 12/06, with an early-submission bonus point
# for completing by 11:59 Thursday 12/05 — much better early than late.
# **Checkpoint**: complete Part 2 (of 4) by 11:59pm Friday 11/22; no lab
# time is provided, so start each part early.
# **Partners**: one partner allowed, who must be enrolled in the same lab
# section; one of you submits and designates the other on okpy.org so both
# receive credit.
# **Rules**: don't share code with anyone but your partner; discuss
# questions, not answers — demonstrate on a similar problem instead.
# **Support**: office hours, Piazza (private posts for solution details),
# classmates, or email your TA/tutor; staff contacts are at
# http://data8.org/fa18/staff.html.
# **Tests**: passing a question's tests does NOT mean the answer is correct
# — they usually only check column labels, and more tests run at scoring
# time, so check your work.
# # Welcome to lab 3!
# This week we'll learn about *tables*, which let us work with multiple
# arrays of data about the same things (Chapter 6 of the text:
# http://www.inferentialthinking.com/chapters/06/tables.html).
# First, set up the tests and imports by running the cell below.

# In[ ]:

import numpy as np
from datascience import *

# These lines load the tests.
from client.api.notebook import Notebook
ok = Notebook('lab03.ok')

# ## 1. Introduction
# An array describes a single attribute of each thing in a collection (e.g.
# land area of each US state); tables extend this by describing multiple
# attributes per element. Below, one array holds world population per year
# (US Census Bureau estimates) and a second holds the years themselves, in
# matching order.

# In[ ]:

population_amounts = Table.read_table("world_population.csv").column(
    "Population")
years = np.arange(1950, 2015 + 1)
print("Population column:", population_amounts)
# Welcome to Lab 2!
# Last time we had our first look at Python and Jupyter notebooks, using
# Python only to manipulate numbers. This lab introduces *strings* (text),
# *methods* (function-like operations tied to a particular piece of data),
# and more ways of working with datasets in Python.
# First, initialize the OK tests. Each time you come back to this site to
# work on the lab, you will need to run this cell again.

# In[ ]:

from client.api.notebook import Notebook
ok = Notebook('lab02.ok')

# ## 1. Review: The building blocks of Python code
# The two building blocks are *expressions* and *statements*. An
# **expression** is self-contained (it makes sense on a line by itself) and
# usually has a value — e.g. `3` and `5 - 2` both evaluate to 3. One
# important form is the **call expression**, which names a function and then
# describes its arguments; the function returns a value based on them.
#!/usr/bin/env python
# coding: utf-8

# In[30]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('project3.ok')

# # Project 3: Movie Classification (Data 8, sp20)
# Build a classifier that guesses whether a movie is a comedy or a thriller
# using only word counts from its screenplay. Goals: build a
# k-nearest-neighbors classifier and test a classifier on data.
#
# ### Logistics
# **Deadline**: 11:59pm Friday 5/01, with an early-submission bonus point
# for completing by 11:59 Thursday 4/30 — much better early than late.
# **Checkpoint**: complete Part 2 (of 4) by 11:59pm Friday 4/24; no lab time
# is provided, so start each part early to stay on track.
# **Partners**: one partner allowed, who does NOT need to be from the same
# lab; one of you submits and designates the other on okpy.org so both
# receive credit.
# **Rules**: don't share code with anyone but your partner; discuss
# questions, not answers — demonstrate on a similar problem instead.
# **Support**: office hours, Piazza (private posts for solution details),
# classmates, or email your TA/tutor; staff contacts are at
# http://data8.org/sp20/staff.html.
# **Tests**: passing a question's tests does NOT mean the answer is correct
# — they usually only check column labels, and more tests run at scoring
# time, so check your work.
import numpy as np
import math
from datascience import *
import matplotlib
matplotlib.use('Agg', warn=False)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

from client.api.notebook import Notebook
ok = Notebook('project3.ok')
_ = ok.auth(inline=True)

# # 1. The Dataset
# Movie screenplays: predict each movie's genre from the text of its
# screenplay. The dataset records, for each movie, the frequency of each of
# 5,000 words (all lowercase) that might occur in its dialog.
# Run the cell below to read the `movies` table.

# In[2]:

movies = Table.read_table('movies.csv')
movies.where("Title", "the matrix").select(0, 1, 2, 3, 4, 5, 10, 30, 5005)
#!/usr/bin/env python
# coding: utf-8

# Initialize the OK tests to get started.
# In[2]:

from client.api.notebook import Notebook
ok = Notebook('lab02.ok')
_ = ok.auth(inline=True)

# In[3]:

import settings

# **Submission**: This should be submitted in PDF format with Homework 2.
# In[4]:

a = 5 * 13 * 31 + 2
b = 2 ** 5 - 2 ** 11 - 2 ** 1
b = 2018  # NOTE: immediately overwrites the value computed above
settings.new_year = max(a, b)
settings.new_year
#!/usr/bin/env python
# coding: utf-8

# In[26]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('project2.ok')

# # Project 2: Diet and Disease (Data 8, sp19)
# Investigate the major causes of death in the world, and how one of them —
# heart disease — might be linked to diet.
#
# ### Logistics
# **Deadline**: 11:59pm Friday 4/12 — much better early than late.
# **Checkpoint**: complete the questions through the end of Part 2 by
# 11:59pm Friday 4/5; some lab time is available, but start before lab and
# leave time to finish afterward.
# **Partners**: one partner allowed, who must be enrolled in the same lab;
# one of you submits and designates the other on okpy.org so both receive
# credit.
# **Rules**: don't share code with anyone but your partner; discuss
# questions, not answers — demonstrate on a similar problem instead.
# **Support**: office hours, Piazza (private posts for solution details),
# classmates, or email your TA/tutor; staff contacts are at
# http://data8.org/sp19/staff.html.
# **Tests**: passing a question's tests does NOT mean the answer is correct
# — they usually only check column labels, and more tests run at scoring
# time, so check your work.
# **Advice**: develop answers incrementally — break complicated table
# manipulations into steps on separate lines, name each intermediate result,
# and verify it; add any extra names or functions you want to the provided
# cells. All needed concepts are in the textbook; reading the relevant
# section often clarifies a concept when you are stuck.
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('hw6.ok')

# # Homework 6: Predicting Housing Prices (Continued)
# ## Due Date: 11:59pm Monday, April 6
# ### Collaboration Policy: you may talk with others about the homework, but
# write your solutions individually; list collaborators in the cell below.
# **Collaborators:** *write names here*
#
# ## Introduction
# Continues Homework 5, whose linear model was too simple to estimate the
# observed housing prices accurately. This homework guides the iterative
# process of specifying, fitting, and analyzing more complex linear models
# for house prices in Ames, Iowa, with the opportunity to choose your own
# features and build your own regression model. Goals: identifying
# informative variables through EDA, feature engineering for categorical
# variables, and more complex sklearn linear models.
# Welcome to Data Science 8.1X Foundations of Data Science! Weekly labs give
# hands-on practice with computational thinking in Python. Fill in the cells
# provided; run the cell below first to load the autograder tests — passing
# all of them earns full credit for the lab. This lab covers numbers, names,
# expressions, and a brief introduction to tables.

# In[ ]:

import numpy as np
from datascience import *

# These lines load the tests.
from client.api.notebook import Notebook
ok = Notebook('lab01.ok')

# # 1. Numbers
# Expressions can represent numbers and ways of combining them; `3.2500`
# evaluates to 3.25 (run the cell and see).

# In[66]:

3.2500

# No `print` needed: Jupyter prints the value of a cell's last line
# automatically, but not prior lines.

# In[ ]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Initialize OK
from client.api.notebook import Notebook
ok = Notebook('proj2b.ok')

# # Project 2 Part B: Spam/Ham Classification
# ## Classifiers
# ### Due Monday, April 27th at 11:59pm PST.
#
# **Collaboration Policy**: you may talk with others about the project, but
# write your solutions individually; include collaborators' names at the top
# of your notebook.
# **Collaborators**: *list collaborators here*
#
# ## This Assignment
# In Part A you explored the data through EDA, did basic feature
# engineering, and built a Logistic Regression spam/ham classifier. In
# Part B you will learn to evaluate those classifiers and can improve the
# model by selecting more features.
#
# ## Warning
# The data were filtered for blatantly offensive content as best as
# possible, but some examples in poor taste may remain; if you find one that
# seems inappropriate, tell a TA so it can be removed in future semesters.
#
# ## Score Breakdown
# Welcome to lab 3!
# This week we will focus on manipulating tables (Chapter 6 of the text:
# http://www.inferentialthinking.com/chapters/06/tables.html).
# First, set up the tests and imports by running the cell below.

# In[3]:

import numpy as np
from datascience import *

# These lines load the tests.
from client.api.notebook import Notebook
ok = Notebook('lab03.ok')

# ## 1. Introduction
# An array describes a single attribute of each thing in a collection (e.g.
# land area of each US state); tables extend this by describing multiple
# attributes per element. Below, one array holds world population per year
# (US Census Bureau estimates) and a second holds the years themselves, in
# matching order.

# In[4]:

population_amounts = Table.read_table("world_population.csv").column(
    "Population")
years = np.arange(1950, 2015 + 1)
print("Population column:", population_amounts)
# modules for research report
from datascience import *
import numpy as np
import random
import pandas as pd
import folium
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# module for YouTube video
from IPython.display import YouTubeVideo

# okpy config
from client.api.notebook import Notebook
ok = Notebook('airbnb-final-project.ok')
_ = ok.auth(inline=True)

# # Airbnb Listings and Evictions
# Data from Inside Airbnb (http://insideairbnb.com/get-the-data.html), an
# independent project that web-scrapes substantial monthly Airbnb data for
# 100+ cities to investigate Airbnb's effects on affordable housing and
# gentrification; its data are free and open for use.
# This is a random subset of Inside Airbnb's San Francisco data from June
# 2020, cleaned for convenience: missing values removed and low-quality
# observations/variables filtered out; a brief descriptive summary follows.
# The dataset is potentially much larger (rows and columns) than other
# project datasets, so there are many possible directions of analysis — it
# is also easy to become overwhelmed or lost in the data.