def test_nb_grade_simple_valid():
    """Test parsing & running a simple oktest file."""
    # Resolve the fixture relative to this test module's own directory.
    here = os.path.dirname(__file__)

    nb = Notebook(os.path.join(here, 'oktests/simple.ok'))

class WandbTrackedOK(object):

    def __init__(self, entity, path, project):
        """Create the wrapped grader and initialise wandb.

        Args:
            entity: Passed to ``wandb.init`` as ``entity``.
            path: Passed to ``Notebook`` to build the grader.
            project: Passed to ``wandb.init`` as ``project``.
        """
        self.grader = Notebook(path)
        wandb.init(anonymous="must", project=project, entity=entity)
        tests = self.grader.assignment.test_map
        self.test_map = tests
        # Every entry of the test map starts with a recorded value of 0.
        self.pass_dict = dict.fromkeys(tests, 0)

    def grade(self, question, *args, **kwargs):
        result = self.grader.grade(question, *args, **kwargs)
        self.pass_dict[question] = result["passed"]

    def log(self):
        """Send the per-question results and their sum to ``wandb.log``."""
        passes = self.pass_dict
        total = sum(passes.values())
        wandb.log({"passes": passes, "total": total})

    def __delete__(self):
    return classify(row, train_20, train_movies.column("Genre"), 3)

new_test_guesses = test_20.apply(another_classifier)
new_proportion_correct = np.count_nonzero(new_test_guesses == test_movies.column("Genre")) / test_movies.num_rows

# Briefly describe what you tried to improve your classifier. As long as you put some effort into improving your classifier and describe what you have done, you will receive full credit for this problem.

# Original prediction 73.
# I first tried to manipulate the value of K: when I increased it to 20 my prediction dropped to 67.5, and when I decreased it to 7 my prediction increased to 78. When that did not help my prediction enough, I then tried to append the staff features to my original 20, and that did not help either, lowering my prediction to 46. Lastly, I tried using just the staff variables, which again brought no improvement and lowered my prediction to 59. Since the only increase I got was by dropping the K value to 7, I ended up dropping it to 3, which again increased my prediction to 78, the same value as with K = 7.

# Congratulations: you're done with the required portion of the project! Time to submit.

_ = ok.submit()

import os
print("Running all tests...")
_ = [ok.grade(q[:-3]) for q in os.listdir("tests") if q.startswith('q')]
print("Finished running all tests.")

seconds_in_a_decade = ...

# We've put this line in this cell so that it will print
# the value you've given to seconds_in_a_decade when you
# run it.  You don't need to change this.

# <img src="" alt="comic about comments">

# ## 2.3. Application: A physics experiment
# <!--
# name: q1_1
# manual: false
# -->

all_unique_causes = np.unique(causes_of_death.column("Cause"))

# This function may be useful for Question 2.
def elem(x):
    """Return ``x.item(0)``, i.e. the first element of ``x``."""
    first_entry = x.item(0)
    return first_entry

# **Question 2:** We would like to plot the death rate for each disease over time. To do so, we must create a table with one column for each cause and one row for each year.
# Create a table called `causes_for_plotting`. It should have one column called `Year`, and then a column with age-adjusted death rates for each of the causes you found in Question 1. There should be as many of these columns in `causes_for_plotting` as there are causes in Question 1.
# *Hint*: Use `pivot`, and think about how the `elem` function might be useful in getting the **Age Adjusted Death Rate** for each cause and year combination.
settings.new_year = max(a,b)

# Check your work by executing the next cell.

settings.woman_asking = ""
woman_quote = '"Can it be that you have come from outer space?"'
gagarin_reply = 'Gagarin replied:'
    'The Shawshank Redemption (1994)', 'The Godfather (1972)',
    'The Godfather: Part II (1974)', 'Pulp Fiction (1994)',
    "Schindler's List (1993)",
    'The Lord of the Rings: The Return of the King (2003)',
    '12 Angry Men (1957)', 'The Dark Knight (2008)',
    'Il buono, il brutto, il cattivo (1966)',
    'The Lord of the Rings: The Fellowship of the Ring (2001)')

top_10_movies = ...
# We've put this next line here so your table will get printed out when you
# run this cell.

imdb = ...

import sqlite3

conn = sqlite3.connect('taxi.db')
lon_bounds = [-74.03, -73.75]
lat_bounds = [40.6, 40.88]

squery = "SELECT * FROM taxi WHERE pickup_lon BETWEEN {} AND {} AND dropoff_lon  BETWEEN {} AND {} AND pickup_lat BETWEEN {} AND {} AND dropoff_lat BETWEEN {} AND {}".format(
    lon_bounds[0], lon_bounds[1], lon_bounds[0], lon_bounds[1], lat_bounds[0],
    lat_bounds[1], lat_bounds[0], lat_bounds[1])

all_taxi = pd.read_sql(squery, conn)

# A scatter plot of pickup locations shows that most of them are on the island of Manhattan. The empty white rectangle is Central Park; cars are not allowed there.

# In[5]:

def pickup_scatter(t):
    """Scatter-plot the pickup coordinates in ``t`` and title the plot.

    ``t`` must support lookup of ``'pickup_lon'`` and ``'pickup_lat'``;
    longitude goes on the x-axis and latitude on the y-axis.
    """
    longitudes = t['pickup_lon']
    latitudes = t['pickup_lat']
    plt.scatter(longitudes, latitudes, s=2, alpha=0.2)
    plt.title('Pickup locations')

plt.figure(figsize=(8, 8))
# <img src="" alt="comic about comments">

source = [i['source'] for i in all_tweets]
text = [i['text'] if 'text' in i else i['full_text'] for i in all_tweets]
retweet_count = [i['retweet_count'] for i in all_tweets]
trump = pd.DataFrame(
        'time': time,
        'source': source,
        'text': text,
        'retweet_count': retweet_count

# ---
# # Part 2: Tweet Source Analysis
# In the following questions, we are going to find out the characteristics of Trump tweets and the devices used for the tweets.
# First let's examine the source field:

# ## Question 2
# Notice how sources like "Twitter for Android" or "Instagram" are surrounded by HTML tags. In the cell below, clean up the `source` field by removing the HTML tags from each `source` entry.
# ### Part 1a: Looking Inside and Extracting the Zip Files

my_zip = zipfile.ZipFile(file=dest_path, mode='r')
data_dir_path = Path(
    'data')  # creates a Path object that points to the data directory
list_names = [ for x in data_dir_path.glob('*') if x.is_file()]

from pathlib import Path
data_dir = Path('data')
get_ipython().system('ls {data_dir}')

# The cell above created a folder called `data`, and in it there should be four CSV files. Open up `legend.csv` to see its contents.

# ### Part 1b: Programmatically Looking Inside the Files

ds100_utils.head('data/businesses.csv', 5)
ds100_utils.head('data/inspections.csv', 5)
ds100_utils.head('data/legend.csv', 5)
ds100_utils.head('data/violations.csv', 5)
b_pop_1 ='time', 'population_total', 'geo')
b_pop = b_pop_1.where('geo', are.equal_to('bgd')).drop('geo').where(
    'time', are.above_or_equal_to(1970)).where('time',

# Run the following cell to create a table called `b_five` that has the population of Bangladesh every five years. At a glance, it appears that the population of Bangladesh has been growing quickly indeed!

b_pop.set_format('population_total', NumberFormatter)

fives = np.arange(1970, 2016, 5)  # 1970, 1975, 1980, ...
b_five = b_pop.sort('time').where('time', are.contained_in(fives))

def first(values):
    """Return ``values.item(0)``, i.e. the first element of ``values``."""
    leading = values.item(0)
    return leading

latest = ...

latest.relabel(0, 'geo').relabel(1, 'time').relabel(
    2, 'poverty_percent')  # You should *not* change this line.

poverty_and_pop = ...
recent = ...

      a list of the top n richest neighborhoods as measured by the metric function
    table = data[["Neighborhood", "SalePrice"]].groupby("Neighborhood").agg(metric).sort_values("SalePrice", ascending = False)
    neighborhoods = [i for i in table.iloc[:n].index]
    return neighborhoods

rich_neighborhoods = find_rich_neighborhoods(training_data, 3, np.median)

# ### Question 1c <a name="q1c"></a> 
# We now have a list of neighborhoods we've deemed as richer than others.  Let's use that information to make a new variable `in_rich_neighborhood`.  Write a function `add_rich_neighborhood` that adds an indicator variable which takes on the value 1 if the house is part of `rich_neighborhoods` and the value 0 otherwise.
# **Hint:** `pd.Series.astype` may be useful for converting True/False values to integers.
# *The provided tests check that you answered correctly, so that future analyses are not corrupted by a mistake.*
bottom_left = 1

# What properties does a word in the bottom right corner have?

bottom_right = 3

# What properties does a word in the top right corner have?

top_right = 4

# name: q6a
# points: 1
# -->

zero_predictor_fp = 0
zero_predictor_fn = sum(Y_train == 1)

# ### Question 6b
# What are the accuracy and recall of `zero_predictor` (classifies every email as ham) on the training set? Do **NOT** use any `sklearn` functions.
# <!--
# name: q6b
# points: 1
# -->

new_year = ...

# Check your work by executing the next cell.

print("I <3", 'Data Science')

# Just like names can be given to numbers, names can be given to string values.  The names and strings aren't required to be similar in any way. Any name can be assigned to any string.
# <!--
# name: q1
# points: 3
# -->

# These should be True or False
q1statement1 = False
q1statement2 = True
q1statement3 = True

# ### SalePrice vs Gr_Liv_Area
# Next, we visualize the association between `SalePrice` and `Gr_Liv_Area`.  The `codebook.txt` file tells us that `Gr_Liv_Area` measures "above grade (ground) living area square feet."
# This variable represents the square footage of the house excluding anything underground.  Some additional research (into real estate conventions) reveals that this value also excludes the garage space.

#def check_get_hashtags(file,hashtag,answer):
#    with open(file) as json_file:
#        statuses = json.load(json_file)
#    other_hashtags = get_hashtags(statuses, hashtag)
#    #print(other_hashtags)
#    other_hashtags = [s.replace('#', '') for s in other_hashtags]
#    if other_hashtags==answer:
#        return True
#    else:
#        return False

ok = Notebook(cf['ok_file'])
_ = ok.auth(inline=False)
results = {
    q[:-3]: ok.grade(q[:-3])
    for q in os.listdir("tests") if q.startswith('q')

import autograde as ag

def output_tests(cf, results):
    """Begin assembling an ``autograde`` dict from the config ``cf``.

    The visible code copies ``cf['github_id']`` and, for each name listed in
    ``cf['variables']`` that exists in this module's globals, stores the
    evaluated value under that name.

    NOTE(review): this definition looks truncated in the visible source --
    ``results`` is never used and nothing is returned; confirm against the
    complete file before relying on this description.
    """
    autograde = {}
    autograde['github_id'] = cf['github_id']
    # This is a selection of variables from the config file.
    for s in cf['variables']:
        # Names that are not defined as module-level globals are skipped.
        if s in globals():
            # NOTE(review): eval() executes whatever text the config supplies;
            # a plain lookup such as globals()[s] would avoid running
            # arbitrary expressions -- confirm that cf is trusted.
            autograde[s] = eval(s)
temp = np.arange(1970, 2016)
bgd = ["bgd"]
holder = population.where("geo", are.contained_in(bgd))
b_pop = holder.where("time", are.contained_in(temp))
b_pop = b_pop.drop("geo")

# Run the following cell to create a table called `b_five` that has the population of Bangladesh every five years. At a glance, it appears that the population of Bangladesh has been growing quickly indeed!

b_pop.set_format('population_total', NumberFormatter)

fives = np.arange(1970, 2016, 5)  # 1970, 1975, 1980, ...
b_five = b_pop.sort('time').where('time', are.contained_in(fives))

# <!--
# name: q1ci
# points: 3
# -->

ins_named = ins.merge(bus[['name', 'address', 'bid']], how='left')


worst_restaurant = ins_named[['score',

# **Use the cell above to identify the restaurant** with the lowest inspection scores ever. Be sure to include the name of the restaurant as part of your answer in the cell below. You can also look up the reviews page for this restaurant online. Feel free to add anything interesting you want to share.
# <!--
# name: q1cii
# points: 1
# manual: True
# #### Question 1.1
# Set `expected_row_sum` to the number that you __expect__ will result from summing all proportions in each row, excluding the first six columns.
# <!--
# name: q1_1
# -->

# Set expected_row_sum to a number that's the (approximate) sum of each row of word proportions.
expected_row_sum = 1

# This dataset was extracted from a dataset from Cornell University. After transforming the dataset (e.g., converting the words to lowercase, removing the naughty words, and converting the counts to frequencies), we created this new dataset containing the frequency of 5000 common words in each movie.

print('Words with frequencies:', movies.drop(np.arange(6)).num_columns)
print('Movies with genres:', movies.num_rows)

np.average(raw_compensation.column("Total Pay"))

# You should see an error. Let's examine why this error occurred by looking at the values in the "Total Pay" column. Use the `type` function and set `total_pay_type` to the type of the first value in the "Total Pay" column.

total_pay_type = ...

mark_hurd_pay_string = ...

_ = ok.grade('q1_2')
new_year = ...

# Check your work by executing the next cell.

print("I <3", 'Data Science')

# Just like names can be given to numbers, names can be given to string values.  The names and strings aren't required to be similar in any way. Any name can be assigned to any string.
# <!--
# name: q1_0
# -->

# Set expected_row_sum to a number that's the (approximate) sum of each row of word proportions.
expected_row_sum = 1

# This dataset was extracted from a dataset from Cornell University. After transforming the dataset (e.g., converting the words to lowercase, removing the naughty words, and converting the counts to frequencies), we created this new dataset containing the frequency of 5000 common words in each movie.

print('Words with frequencies:', movies.drop(np.arange(5)).num_columns) 
print('Movies with genres:', movies.num_rows)

# Record which rows had a missing email *before* filling, using the same
# column that is filled on the next line.
train_email_nan = original_training_data['email'].isna()
original_training_data['email'] = original_training_data['email'].fillna("")

# Test set: for each column, capture the missing-value mask first and then
# replace the missing entries with empty strings.
test_subject_nan = test['subject'].isna()
test['subject'] = test['subject'].fillna("")

# The mask must come from 'email'; 'subject' was already filled above, so a
# mask taken from it would be all False.
test_email_nan = test['email'].isna()
test['email'] = test['email'].fillna("")

# ### Question 1b
total_pay_type = ...

mark_hurd_pay_string = ...

    'The Godfather: Part II (1974)', 'Pulp Fiction (1994)',
    "Schindler's List (1993)",
    'The Lord of the Rings: The Return of the King (2003)',
    '12 Angry Men (1957)', 'The Dark Knight (2008)',
    'Il buono, il brutto, il cattivo (1966)',
    'The Lord of the Rings: The Fellowship of the Ring (2001)')

top_10_movies = Table().with_columns("Rating", top_10_movie_ratings, "Name",
# We've put this next line here so your table will get printed out when you
# run this cell.

imdb = Table.read_table("imdb.csv")

        if pd.isnull(value):
            x.iloc[index] = 'Missing'
    return x

# Append one entry to post_missing for every value that is still null, column
# by column, so that its sum is the number of missing values.
for i in original_training_data.columns:
    post_missing += [1 for k in original_training_data[i] if pd.isnull(k)]
# One interpolated f-string replaces the placeholder-less f-string plus str()
# concatenation; the printed text is unchanged.
print(f"There are now {sum(post_missing)} missing values.")

# ### Question 1b
