# In[7]:

wordentail_data['edge_disjoint']['dev'][:5]


# Let's test to make sure no edges are shared between `train` and `dev`:

# In[8]:

nli.get_edge_overlap_size(wordentail_data, 'edge_disjoint')


# As we expect, a *lot* of vocabulary items are shared between `train` and `dev`:

# In[9]:

nli.get_vocab_overlap_size(wordentail_data, 'edge_disjoint')


# This is a large percentage of the entire vocab:

# In[10]:

len(wordentail_data['vocab'])


# Here's the distribution of labels in the `train` set. It's highly imbalanced,
# which will pose a challenge for learning. (I'll go ahead and reveal that the
# `dev` set is similarly distributed.)

# In[11]:

def label_distribution(split):
    return pd.DataFrame(wordentail_data[split]['train'])[1].value_counts()
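# The `nli.get_edge_overlap_size` and `nli.get_vocab_overlap_size` helpers come
# from the course's `nli` module. As a rough guide to what they measure, here is
# a minimal sketch, assuming each example in `wordentail_data[split][...]` is a
# `[[word1, word2], label]` pair (an illustrative reimplementation, not the
# module's actual code):

def sketch_edge_overlap_size(wordentail_data, split):
    """Count of (word1, word2) edges that appear in both `train` and `dev`."""
    train_edges = {tuple(pair) for pair, label in wordentail_data[split]['train']}
    dev_edges = {tuple(pair) for pair, label in wordentail_data[split]['dev']}
    return len(train_edges & dev_edges)


def sketch_vocab_overlap_size(wordentail_data, split):
    """Count of individual words that appear in both `train` and `dev`."""
    train_vocab = {w for pair, label in wordentail_data[split]['train'] for w in pair}
    dev_vocab = {w for pair, label in wordentail_data[split]['dev'] for w in pair}
    return len(train_vocab & dev_vocab)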
def test_vocab_overlap_size(wordentail_data, split, count):
    # `wordentail_data` is a pytest fixture; `split` and `count` are expected
    # to be supplied via `pytest.mark.parametrize`.
    result = nli.get_vocab_overlap_size(wordentail_data, split)
    assert result == count
def test_vocab_overlap_size_word_disjoint(wordentail_data):
    # Assumes the 'word_disjoint' split, whose train/dev vocabularies are
    # disjoint by construction, so the expected overlap is 0.
    result = nli.get_vocab_overlap_size(wordentail_data, 'word_disjoint')
    assert result == 0
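# A minimal sketch of the pytest wiring these tests assume: `wordentail_data`
# is provided by a fixture that loads the word-entailment JSON once per module.
# The file path below is a placeholder for wherever the data lives in your
# setup, not the dataset's actual location.

import json
import os

import pytest

import nli


@pytest.fixture(scope="module")
def wordentail_data():
    # Hypothetical path; point this at the word-entailment JSON file.
    src = os.path.join("data", "nlidata", "nli_wordentail_data.json")
    with open(src) as f:
        return json.load(f)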