# In[17]:

print(test_text_prepare())

# Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points.

# In[18]:

# Run text_prepare over every line of the grader's test file and submit the
# newline-joined results under the 'TextPrepare' tag.
prepared_questions = []
# The context manager closes the file handle deterministically; the explicit
# UTF-8 encoding keeps the read independent of the platform default locale.
with open('data/text_prepare_tests.tsv', encoding='utf-8') as tests_file:
    for line in tests_file:
        # strip() drops the trailing newline before normalisation.
        prepared_questions.append(text_prepare(line.strip()))
text_prepare_results = '\n'.join(prepared_questions)

grader.submit_tag('TextPrepare', text_prepare_results)

# Now we can preprocess the titles using the function *text_prepare* and make sure that the headers don't have bad symbols:

# In[19]:

# Apply text_prepare to every title of each split, keeping the splits as lists.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))

# In[20]:

X_train[:3]

# For each tag and for each word calculate how many times they occur in the train corpus.
#
Ejemplo n.º 2
0
#
# *Hints:*
#
# * Sometimes items are returned, find such examples in the dataset.
# * It is handy to split `date` field into [`day`, `month`, `year`] components and use `df.year == 14` and `df.month == 9` in order to select target subset of dates.
# * You may work with the `date` feature as with strings, or you may first convert it to a datetime type with the `pd.to_datetime` function, but do not forget to set the correct `format` argument.

# In[9]:

# Select the September 2014 rows; this relies on the `date` strings ending
# in 'MM.YYYY'.
filtered_trans = transactions[transactions.date.str.endswith('09.2014')]
# Build the revenue column with assign() so a new frame is produced instead of
# writing into a slice of `transactions` (avoids SettingWithCopyWarning and
# any ambiguity about whether the write reaches the parent frame).
filtered_trans = filtered_trans.assign(
    total_profit_or_loss=filtered_trans['item_price'] * filtered_trans['item_cnt_day'])

# Per-shop revenue totals, then the largest of them.
max_revenue = filtered_trans.groupby(
    'shop_id')['total_profit_or_loss'].sum().max()
grader.submit_tag('max_revenue', max_revenue)

# Great! Let's move on and answer another question:
#
# <ol start="2">
#   <li><b>What item category generated the highest revenue in summer 2014?</b></li>
# </ol>
#
# * Submit `id` of the category found.
#
# * Here we call "summer" the period from June to August.
#
# *Hints:*
#
# * Note, that for an object `x` of type `pd.Series`: `x.argmax()` returns **index** of the maximum element. `pd.Series` can have non-trivial index (not `[1, 2, 3, ... ]`).
Ejemplo n.º 3
0
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function", 
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'
    
# Run text_prepare over every line of the grader's test file and submit the
# newline-joined results under the 'TextPrepare' tag.
prepared_questions = []
# The context manager guarantees the file handle is closed after reading.
with open('data/text_prepare_tests.tsv', encoding='utf-8') as tests_file:
    for line in tests_file:
        # strip() drops the trailing newline before normalisation.
        prepared_questions.append(text_prepare(line.strip()))
text_prepare_results = '\n'.join(prepared_questions)

grader.submit_tag('TextPrepare', text_prepare_results)

# Normalise the titles of every split with the same function.
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}

temp = {}
for posttag in train['tags']:
  for tag in posttag:
    if tag in temp:
      temp[tag] = temp.get(tag) + 1
    X_tr, X_val = test_df.iloc[tr_ind], test_df.iloc[val_ind]
    means = X_val['item_id'].map(X_tr.groupby('item_id').target.mean())
    X_val['item_id_target_mean'] = means
    test_df.iloc[val_ind] = X_val
    
# Global target mean, used as the fallback encoding value.
prior = test_df['target'].mean()
# Fill only the encoded column: a frame-wide fillna would also overwrite
# missing values in every unrelated column with the target mean.
test_df['item_id_target_mean'] = test_df['item_id_target_mean'].fillna(prior)
# Pearson correlation between the target and the encoded feature.
corr = np.corrcoef(all_data['target'].values, test_df['item_id_target_mean'])[0][1]
# You will need to compute correlation like that
#corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
#corr = 0.41645904885340546
#%%

print(corr)
#%%
# Submit through the `grader` instance, as every other submission in this
# file does; calling submit_tag on the Grader class would bind the tag
# string to `self`.
grader.submit_tag('KFold_scheme', corr)

# %%

# 2. Leave-one-out-scheme
# NOTE: plain assignment aliases `all_data`, so the columns added below are
# added to `all_data` as well.
test_df = all_data
# transform('sum') broadcasts each item's target total back onto that item's
# rows. A plain groupby(...).sum() is indexed by item_id, so assigning it as a
# column would align it against the row index and yield unrelated values/NaN.
test_df['item_target_sum'] = test_df.groupby('item_id')['target'].transform('sum')

# %%
test_df.head()

#%%
# Placeholder column for the leave-one-out encoding.
test_df['LOOCV'] = 0
Ejemplo n.º 5
0
# Out-of-fold mean encoding: each validation fold receives per-item target
# means computed on the remaining folds only.
for train_index, val_index in kfold.split(all_data):
    X_train = all_data.iloc[train_index]
    # Work on an explicit copy so that adding a column below does not write
    # into a slice of `all_data` (SettingWithCopyWarning / undefined write-back).
    X_val = all_data.iloc[val_index].copy()

    item_id_target_mean = X_train.groupby('item_id').target.mean()
    X_val['item_target_enc'] = X_val['item_id'].map(item_id_target_mean)
    # Write the encoded fold back into the full frame by position.
    all_data.iloc[val_index] = X_val

# Items absent from a training fold map to NaN; replace those with 0.3343
# (presumably the global target mean — confirm). Assigning the result back
# avoids a chained in-place fillna, which may operate on a temporary and
# leave the frame unchanged.
all_data['item_target_enc'] = all_data['item_target_enc'].fillna(0.3343)
encoded_feature = all_data['item_target_enc'].values

# In[27]:

# Pearson correlation between the target and the encoded feature.
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)
grader.submit_tag('KFold_scheme', corr)

# # 2. Leave-one-out scheme

# Now, implement leave-one-out scheme. Note that if you just simply set the number of folds to the number of samples and run the code from the **KFold scheme**, you will probably wait for a very long time.
#
# To implement a faster version, note, that to calculate mean target value using all the objects but one *given object*, you can:
#
# 1. Calculate sum of the target values using all the objects.
# 2. Then subtract the target of the *given object* and divide the resulting value by `n_objects - 1`.
#
# Note that you do not need to perform `1.` for every object. And `2.` can be implemented without any `for` loop.
#
# It is most convenient to use the `.transform` function, as in **Method 2**.

# In[13]:
Ejemplo n.º 6
0

print(question_to_vec_tests())

import nltk

nltk.download('stopwords')
from util import array_to_string

# Embed every question of the grader's test file and submit the flattened
# concatenation of all vectors under the 'Question2Vec' tag.
question2vec_result = []
# The context manager guarantees the file handle is closed after reading.
with open('data/test_embeddings.tsv') as embeddings_file:
    for question in embeddings_file:
        question = question.strip()
        answer = question_to_vec(question, wv_embeddings)
        # np.append flattens both operands, so the result stays one flat array.
        question2vec_result = np.append(question2vec_result, answer)

grader.submit_tag('Question2Vec', array_to_string(question2vec_result))


def hits_count(dup_ranks, k):
    """
        dup_ranks: list of duplicates' ranks; one rank per question;
                   length is a number of questions which we are looking for duplicates;
                   rank is a number from 1 to len(candidates of the question);
                   e.g. [2, 3] means that the first duplicate has the rank 2, the second one — 3.
        k: number of top-ranked elements (k in Hits@k metric)

        result: return Hits@k value for current ranking
    """
    count = 0
    for i in dup_ranks:
        if i <= k:
Ejemplo n.º 7
0
print('Test  R-squared for stacking is %f' % r2_test_stacking)

# In[ ]:

# Interestingly, the score turned out to be lower than with the previous method. Although the model is very simple (just 3 parameters) and, in fact, mixes predictions linearly, it looks like it managed to overfit. **Examine and compare** train and test scores for the two methods.
#
# And of course this particular case does not mean simple mix is always better than stacking.

# We are all done! Submit everything we need to the grader now.

# In[53]:

from grader import Grader
grader = Grader()

# Submit every computed answer to the grader, in the same order as before:
# the mixing weight first, then the simple-mix scores, then the stacking scores.
for tag, value in (
    ('best_alpha', best_alpha),
    ('r2_train_simple_mix', r2_train_simple_mix),
    ('r2_test_simple_mix', r2_test_simple_mix),
    ('r2_train_stacking', r2_train_stacking),
    ('r2_test_stacking', r2_test_stacking),
):
    grader.submit_tag(tag, value)

# In[54]:

STUDENT_EMAIL = ''
STUDENT_TOKEN = ''
grader.status()

# In[55]: