Example #1
def test_wordentail_experiment(wordentail_data, condition):
    nli.wordentail_experiment(
        train_data=wordentail_data[condition]['train'],
        assess_data=wordentail_data[condition]['dev'],
        vector_func=lambda x: np.ones(10),
        vector_combo_func=lambda u, v: np.concatenate((u, v)),
        model=TorchShallowNeuralClassifier(hidden_dim=5, max_iter=1))
Example #2
    def system_1():

        # Data------------
        with open(wordentail_filename) as f:
            wordentail_data = json.load(f)

        print("Distribution of labels : \n{0}".format(
            pd.DataFrame(
                wordentail_data['word_disjoint']['train'])[1].value_counts()))

        def vec_merge(u, v):
            """Merge different feature reps including array diff, max, avg etc."""
            return np.concatenate((u, v, vec_diff(u, v), vec_max(u, v)))

        # Model-----------
        net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)
        print(net)

        # Exp-------------
        result = nli.wordentail_experiment(
            train_data=wordentail_data['word_disjoint']['train'],
            assess_data=wordentail_data['word_disjoint']['dev'],
            model=net,
            vector_func=glove_vec,
            vector_combo_func=vec_merge)

        return result['macro-F1']
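
The example above leans on helper combo functions `vec_diff` and `vec_max` that are not shown. Minimal sketches of plausible definitions (hypothetical, inferred from the names and from how `vec_merge` uses them):

def vec_diff(u, v):
    # Element-wise difference between premise and hypothesis vectors.
    return u - v

def vec_max(u, v):
    # Element-wise maximum of the two vectors.
    return np.maximum(u, v)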
Example #3
    def system_0_original():

        # Data------------
        with open(wordentail_filename) as f:
            wordentail_data = json.load(f)

        print("Distribution of labels : \n{0}".format(
            pd.DataFrame(
                wordentail_data['word_disjoint']['train'])[1].value_counts()))

        # Model-----------
        X_glove = pd.DataFrame(GLOVE)
        X_glove['$UNK'] = 0
        X_glove = X_glove.T

        vocab = list(X_glove.index)
        embedding = X_glove.values
        net = TorchRNNClassifier(vocab=vocab,
                                 embedding=embedding,
                                 bidirectional=True)

        # Exp-------------
        result = nli.wordentail_experiment(
            train_data=wordentail_data['word_disjoint']['train'],
            assess_data=wordentail_data['word_disjoint']['dev'],
            model=net,
            vector_func=lambda x: np.array([x]),  # wrap the word as a length-1 sequence for the RNN
            vector_combo_func=vec_concatenate)

        return result['macro-F1']
Example #4
def test_bakeoff_experiment(wordentail_data):
    word_disjoint_experiment = nli.wordentail_experiment(
        train_data=wordentail_data['train'],
        assess_data=wordentail_data['dev'],
        vector_func=lambda x: np.ones(10),
        vector_combo_func=lambda u, v: np.concatenate((u, v)),
        model=TorchShallowNeuralClassifier(hidden_dim=5, max_iter=1))

    test_data_filename = os.path.join('data', 'nlidata',
                                      'bakeoff-wordentail-data',
                                      'nli_wordentail_bakeoff_data-test.json')

    nli.bake_off_evaluation(word_disjoint_experiment, test_data_filename)
Example #5
def run_hypothesis_only_evaluation():
    ##### YOUR CODE HERE
    from sklearn.linear_model import LogisticRegression

    result = {}
    for condition_name in ['word_disjoint', 'edge_disjoint']:
        for combo_fn in [vec_concatenate, hypothesis_only]:
            result[(condition_name, combo_fn.__name__)] = nli.wordentail_experiment(
                train_data=wordentail_data[condition_name]['train'],
                assess_data=wordentail_data[condition_name]['dev'], 
                model=LogisticRegression(), 
                vector_func=glove_vec,
                vector_combo_func=combo_fn)['macro-F1']
    return result
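
Examples #5, #7, and #8 assume the combo functions `vec_concatenate` and `hypothesis_only`. Minimal sketches of what these presumably look like (the first mirrors the standard concatenation utility; the second is the usual hypothesis-only trick of discarding the premise):

def vec_concatenate(u, v):
    # Full input: premise and hypothesis vectors side by side.
    return np.concatenate((u, v))

def hypothesis_only(u, v):
    # Hypothesis-only baseline: ignore the premise vector entirely.
    return v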
Example #6
    def system_2():

        # Data------------
        with open(wordentail_filename) as f:
            wordentail_data = json.load(f)

        X_glove = pd.DataFrame(GLOVE).T
        print(X_glove.shape)

        def convert_edges_to_indices(edges, Q):
            lookup = dict(zip(Q.index, range(Q.shape[0])))
            index_edges = defaultdict(set)
            for start, finish_nodes in edges.items():
                s = lookup.get(start)
                if s is not None:  # index 0 is falsy, so test against None explicitly
                    f = {lookup[n] for n in finish_nodes if n in lookup}
                    if f:
                        index_edges[s] = f
            return index_edges

        wn_index_edges = convert_edges_to_indices(wn_edges, X_glove)
        wn_retro = Retrofitter(verbose=True)
        X_retro = wn_retro.fit(X_glove, wn_index_edges)
        print(X_retro.shape)

        def retro_vec(w):
            """Return `w`'s Retrofitted representation if available, else return 
            a random vector."""
            return X_retro.loc[w].values if w in X_retro.index else randvec(
                w, n=glove_dim)

        # Model-----------
        net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)
        print(net)

        # Exp-------------
        result = nli.wordentail_experiment(
            train_data=wordentail_data['word_disjoint']['train'],
            assess_data=wordentail_data['word_disjoint']['dev'],
            model=net,
            vector_func=retro_vec,
            vector_combo_func=vec_concatenate)

        return result['macro-F1']
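
`retro_vec` above falls back to `randvec`, which the snippet does not define. A minimal sketch, assuming it returns a fixed-dimensional random vector for out-of-vocabulary words (signature inferred from the call `randvec(w, n=glove_dim)`):

def randvec(w, n=50, lower=-0.5, upper=0.5):
    # Hypothetical helper: uniform random vector for the unseen word `w`;
    # `w` itself is unused beyond matching the call site's signature.
    return np.random.uniform(low=lower, high=upper, size=n)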
Example #7
def run_hypothesis_only_evaluation():
    ##### YOUR CODE HERE
    from sklearn.linear_model import LogisticRegression

    eval_results = {}
    net = LogisticRegression()
    for condition_name in ['edge_disjoint', 'word_disjoint']:
        for vec_combo_func in [vec_concatenate, hypothesis_only]:
            result = nli.wordentail_experiment(
                train_data=wordentail_data[condition_name]['train'],
                assess_data=wordentail_data[condition_name]['dev'],
                model=net,
                vector_func=glove_vec,
                vector_combo_func=vec_combo_func)

            eval_results[(condition_name,
                          vec_combo_func.__name__)] = result['macro-F1']

    return eval_results
Example #8
def run_hypothesis_only_evaluation():
    ##### YOUR CODE HERE
    # `import sklearn` alone does not expose `sklearn.linear_model`; import it directly.
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()

    datasets = ['word_disjoint', 'edge_disjoint']
    v_funcs = [vec_concatenate, hypothesis_only]

    result = {}  # a plain dict suffices; defaultdict() without a factory adds nothing

    for data in datasets:
        for v_func in v_funcs:
            word_experiment = nli.wordentail_experiment(
                train_data=wordentail_data[data]['train'],
                assess_data=wordentail_data[data]['dev'],
                model=model,
                vector_func=glove_vec,
                vector_combo_func=v_func)

            result[(data, v_func.__name__)] = word_experiment['macro-F1']

    return result
Example #9
    def system_3():

        # Data------------
        with open(wordentail_filename) as f:
            wordentail_data = json.load(f)

        x_train = wordentail_data['word_disjoint']['train']
        print("Existing distribution of labels : \n{0}".format(
            pd.DataFrame(x_train)[1].value_counts()))

        # get wordnet edges
        def get_wordnet_edges():
            edges = defaultdict(set)
            for ss in wn.all_synsets():
                lem_names = {lem.name() for lem in ss.lemmas()}
                for lem in lem_names:
                    edges[lem] |= lem_names
            return edges

        wn_edges = get_wordnet_edges()

        # data augmentation of positive entailments.
        positive_entailments = []
        for premise_hypothesis, label in x_train:
            if label == 1:
                positive_entailments.append(premise_hypothesis)

        print("Current count of positives: {0}".format(
            len(positive_entailments)))

        positive_entailments_ex = []
        for premise_hypothesis in positive_entailments:
            premise = premise_hypothesis[0]
            hypothesis = premise_hypothesis[1]

            for wn_premise in wn_edges[premise]:
                if premise == wn_premise:
                    continue
                for wn_hypothesis in wn_edges[hypothesis]:
                    if wn_hypothesis == hypothesis:
                        continue

                    positive_entailments_ex.append([wn_premise, wn_hypothesis])

        print("New count of positives to add: {0}".format(
            len(positive_entailments_ex)))
        x_train.extend([[item, 1] for item in positive_entailments_ex])

        print("New distribution of labels : \n{0}".format(
            pd.DataFrame(
                wordentail_data['word_disjoint']['train'])[1].value_counts()))

        # Model-----------
        net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)

        # Exp-------------
        result = nli.wordentail_experiment(
            train_data=wordentail_data['word_disjoint']['train'],
            assess_data=wordentail_data['word_disjoint']['dev'],
            model=net,
            vector_func=glove_vec,
            vector_combo_func=vec_concatenate)

        return result['macro-F1']
Example #10
#
# For a baseline model, I chose `TorchShallowNeuralClassifier`:

# In[20]:

net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)

# ### Baseline results
#
# The following puts the above pieces together, using `vector_func=glove_vec`, since `vector_func=randvec` seems so hopelessly misguided for `word_disjoint`!
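#
# For reference, `glove_vec` is presumably the usual lookup-with-fallback helper;
# a minimal sketch, assuming `GLOVE` maps words to vectors, `glove_dim` is the
# embedding dimensionality, and `randvec` is a random-vector fallback:

def glove_vec(w):
    # Return `w`'s GloVe vector if available, else a random fallback vector.
    return GLOVE.get(w, randvec(w, n=glove_dim))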

# In[21]:

word_disjoint_experiment = nli.wordentail_experiment(
    train_data=wordentail_data['word_disjoint']['train'],
    assess_data=wordentail_data['word_disjoint']['dev'],
    model=net,
    vector_func=glove_vec,
    vector_combo_func=vec_concatenate)

print("macro-f1: {0}".format(word_disjoint_experiment['macro-F1']))

# ## Homework questions
#
# Please embed your homework responses in this notebook, and do not delete any cells from the notebook. (You are free to add as many cells as you like as part of your responses.)

# ### Hypothesis-only baseline [2 points]
#
# During our discussion of SNLI and MultiNLI, we noted that a number of research teams have shown that hypothesis-only baselines for NLI tasks can be remarkably robust. This question asks you to explore briefly how this baseline affects the 'edge_disjoint' and 'word_disjoint' versions of our task.
#
# For this problem, submit two functions:
#

def vec_func(w):
    # Treat the word as a length-1 token sequence (suits sequence models).
    return w.split()


def vec_concatenate(u, v):
    """Hypothesis-only baseline: ignore the premise and return only `v`.

    The name `vec_concatenate` is kept so the experiment call below picks it up.
    """
    return v


print("---------------------------------------------")

word_disjoint_experiment = nli.wordentail_experiment(
    train_data=train_data,
    assess_data=dev_data,
    model=net,
    vector_func=vec_func,
    vector_combo_func=vec_concatenate)

########################################################
# Also produce numbers for MNLI Matched/Mismatched
from sklearn.metrics import classification_report
from nli import word_entail_featurize


def wordentail_assessonly(assess_data, vector_func, vector_combo_func, model):
    X_dev, y_dev = word_entail_featurize(assess_data, vector_func,
                                         vector_combo_func)
    predictions = model.predict(X_dev)
    print(classification_report(y_dev, predictions, digits=3))
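
# Hypothetical usage, assuming a model already fitted by an earlier
# `nli.wordentail_experiment` call (whose result dict is assumed to expose the
# fitted model under the 'model' key):
#
# wordentail_assessonly(
#     assess_data=dev_data,
#     vector_func=vec_func,
#     vector_combo_func=vec_concatenate,
#     model=word_disjoint_experiment['model'])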

Example #12

# net = TorchShallowNeuralClassifier(hidden_dim=50, max_iter=100)

X_glove = pd.DataFrame(GLOVE)
X_glove['$UNK'] = 0
X_glove = X_glove.T

vocab = list(X_glove.index)
embedding = X_glove.values
net = TorchRNNClassifier(vocab=vocab, embedding=embedding)

word_disjoint_experiment = nli.wordentail_experiment(
    train_data=wordentail_data['word_disjoint']['train'],
    assess_data=wordentail_data['word_disjoint']['dev'],
    # model=GridSearchCV(net, {'hidden_dim': [25, 50, 100]}, cv=2, scoring='f1_macro'),
    model=net,
    vector_func=lambda x: np.array([x]),  # wrap the word as a length-1 sequence for the RNN
    vector_combo_func=vec_concatenate)

print("macro-f1: {0}".format(word_disjoint_experiment['macro-F1']))

#
# The outer keys are the splits plus a list giving the vocabulary for the entire dataset:
#
# In[ ]:
#
# wordentail_data.keys()
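#
# For reference (assumed layout of the bakeoff data, consistent with the splits
# used throughout these examples), the call above would print something like:
#
# dict_keys(['edge_disjoint', 'word_disjoint', 'vocab'])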