# In[17]:

# Print the outcome reported by the basic self-test of `text_prepare`.
print(test_text_prepare())

# Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points.

# In[18]:

# Normalise every line of the grader's test file with `text_prepare` and
# submit the results as one newline-separated string under the 'TextPrepare' tag.
# The context manager closes the file deterministically, and the explicit
# encoding keeps decoding independent of the platform's default locale.
prepared_questions = []
with open('data/text_prepare_tests.tsv', encoding='utf-8') as tests_file:
    for line in tests_file:
        line = text_prepare(line.strip())
        prepared_questions.append(line)
text_prepare_results = '\n'.join(prepared_questions)
grader.submit_tag('TextPrepare', text_prepare_results)

# Now we can preprocess the titles using function *text_prepare*, making sure that the headers don't have bad symbols:

# In[19]:

# Each split is rebuilt as a new list of prepared titles.
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

# In[20]:

# Notebook-style preview of the first three prepared training titles.
X_train[:3]

# For each tag and for each word calculate how many times they occur in the train corpus.
#
#
# *Hints:*
#
# * Sometimes items are returned, find such examples in the dataset.
# * It is handy to split `date` field into [`day`, `month`, `year`] components and use `df.year == 14` and `df.month == 9` in order to select target subset of dates.
# * You may work with the `date` feature as with strings, or you may first convert it to a pandas datetime type with the `pd.to_datetime` function, but do not forget to set the correct `format` argument.

# In[9]:

# Keep only the rows whose `date` string ends with '09.2014'
# (assumes dates are formatted as DD.MM.YYYY - TODO confirm against the data).
# `.copy()` makes the subset an independent frame, so adding a column below
# is a plain assignment rather than a write to a slice of `transactions`.
filtered_trans = transactions[transactions.date.str.endswith('09.2014')].copy()

# Per-row revenue: price times the daily item count.
filtered_trans['total_profit_or_loss'] = (
    filtered_trans['item_price'] * filtered_trans['item_cnt_day']
)

# Sum the per-row revenue for every shop and take the largest shop total.
max_revenue = (
    filtered_trans.groupby('shop_id')['total_profit_or_loss'].sum().max()
)
grader.submit_tag('max_revenue', max_revenue)

# Great! Let's move on and answer another question:
#
# <ol start="2">
#   <li><b>What item category generated the highest revenue in summer 2014?</b></li>
# </ol>
#
# * Submit `id` of the category found.
#
# * Here we call "summer" the period from June to August.
#
# *Hints:*
#
# * Note, that for an object `x` of type `pd.Series`: `x.argmax()` returns **index** of the maximum element. `pd.Series` can have non-trivial index (not `[1, 2, 3, ... ]`).
#   NOTE(review): in pandas >= 1.0 `Series.argmax()` returns the integer position, while `Series.idxmax()` returns the index label - confirm against the installed pandas version.
examples = ["SQL Server - any equivalent of Excel's CHOOSE function?", "How to free c++ memory vector<int> * arr?"] answers = ["sql server equivalent excels choose function", "free c++ memory vectorint arr"] for ex, ans in zip(examples, answers): if text_prepare(ex) != ans: return "Wrong answer for the case: '%s'" % ex return 'Basic tests are passed.' prepared_questions = [] for line in open('data/text_prepare_tests.tsv', encoding='utf-8'): line = text_prepare(line.strip()) prepared_questions.append(line) text_prepare_results = '\n'.join(prepared_questions) grader.submit_tag('TextPrepare', text_prepare_results) X_train = [text_prepare(x) for x in X_train] X_val = [text_prepare(x) for x in X_val] X_test = [text_prepare(x) for x in X_test] # Dictionary of all tags from train corpus with their counts. tags_counts = {} # Dictionary of all words from train corpus with their counts. words_counts = {} temp = {} for posttag in train['tags']: for tag in posttag: if tag in temp: temp[tag] = temp.get(tag) + 1
X_tr, X_val = test_df.iloc[tr_ind], test_df.iloc[val_ind] means = X_val['item_id'].map(X_tr.groupby('item_id').target.mean()) X_val['item_id_target_mean'] = means test_df.iloc[val_ind] = X_val prior = test_df['target'].mean() test_df.fillna(prior, inplace = True) corr = np.corrcoef(all_data['target'].values, test_df['item_id_target_mean'])[0][1] # You will need to compute correlation like that #corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1] #corr = 0.41645904885340546 #%% print(corr) #%% Grader.submit_tag('KFold_scheme', corr) # %% # 2. Leave-one-out-scheme test_df = all_data test_df['item_target_sum'] = test_df.groupby('item_id')['target'].sum() # %% test_df.head() #%% for i in range(0,len(test_df)): print(i) #%% test_df['LOOCV'] = 0
# K-fold target encoding of `item_id`: each validation fold receives the mean
# `target` per item computed on the remaining (training) folds only.
# NOTE(review): the row-wise write-back below assumes `all_data` already has an
# `item_target_enc` column before this loop runs - confirm in the preceding cell.
for train_index, val_index in kfold.split(all_data):
    X_train = all_data.iloc[train_index]
    # Copy the validation rows so the new column is written to an independent
    # frame instead of a slice of `all_data` (avoids chained assignment).
    X_val = all_data.iloc[val_index].copy()
    item_id_target_mean = X_train.groupby('item_id').target.mean()
    X_val['item_target_enc'] = X_val['item_id'].map(item_id_target_mean)
    all_data.iloc[val_index] = X_val

# Items that never appear in a training fold map to NaN; replace those with the
# fixed value 0.3343 (presumably the global target mean - TODO confirm).
# Re-assigning the column, rather than calling `fillna(..., inplace=True)` on
# the selected column, guarantees that `all_data` itself is updated.
all_data['item_target_enc'] = all_data['item_target_enc'].fillna(0.3343)
encoded_feature = all_data['item_target_enc'].values

# In[27]:

# Pearson correlation between the target and the encoded feature.
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)
grader.submit_tag('KFold_scheme', corr)

# # 2. Leave-one-out scheme
# Now, implement leave-one-out scheme. Note that if you just simply set the number of folds to the number of samples and run the code from the **KFold scheme**, you will probably wait for a very long time.
#
# To implement a faster version, note, that to calculate mean target value using all the objects but one *given object*, you can:
#
# 1. Calculate sum of the target values using all the objects.
# 2. Then subtract the target of the *given object* and divide the resulting value by `n_objects - 1`.
#
# Note that you do not need to perform `1.` for every object. And `2.` can be implemented without any `for` loop.
#
# It is the most convenient to use `.transform` function as in **Method 2**.

# In[13]:
print(question_to_vec_tests()) import nltk nltk.download('stopwords') from util import array_to_string question2vec_result = [] for question in open('data/test_embeddings.tsv'): question = question.strip() answer = question_to_vec(question, wv_embeddings) question2vec_result = np.append(question2vec_result, answer) grader.submit_tag('Question2Vec', array_to_string(question2vec_result)) def hits_count(dup_ranks, k): """ dup_ranks: list of duplicates' ranks; one rank per question; length is a number of questions which we are looking for duplicates; rank is a number from 1 to len(candidates of the question); e.g. [2, 3] means that the first duplicate has the rank 2, the second one — 3. k: number of top-ranked elements (k in Hits@k metric) result: return Hits@k value for current ranking """ count = 0 for i in dup_ranks: if i <= k:
print('Test R-squared for stacking is %f' % r2_test_stacking)

# In[ ]:

# Interesting that the score turned out to be lower than in the previous method. Although the model is very simple (just 3 parameters) and, in fact, mixes predictions linearly, it looks like it managed to overfit. **Examine and compare** train and test scores for the two methods.
#
# And of course this particular case does not mean simple mix is always better than stacking.

# We are all done! Submit everything we need to the grader now.

# In[53]:

from grader import Grader

grader = Grader()

# Answers are submitted in this fixed order, one (tag, value) pair per call.
answers = (
    ('best_alpha', best_alpha),
    ('r2_train_simple_mix', r2_train_simple_mix),
    ('r2_test_simple_mix', r2_test_simple_mix),
    ('r2_train_stacking', r2_train_stacking),
    ('r2_test_stacking', r2_test_stacking),
)
for tag, value in answers:
    grader.submit_tag(tag, value)

# In[54]:

# Credentials are intentionally left blank here; fill them in before submitting.
STUDENT_EMAIL = ''
STUDENT_TOKEN = ''
grader.status()

# In[55]: