Esempio n. 1
0
import random
from pathlib import Path

from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 400  # there are only 414 plurals

# sentence frames; main() fills the '{}' slot with a noun
template1 = 'where [MASK] the {} go ?'
template2 = 'what [MASK] the {} do ?'

# Build the vocabulary lookup once instead of calling get_vocab() for every candidate word.
# NOTE(review): assumes vocabulary entries are hashable (plain strings) - confirm against get_vocab().
_VOCAB = set(get_vocab())
_WORD_LISTS_DIR = Path(__file__).parent / 'word_lists'


def _load_in_vocab_words(file_name):
    """Return the words of a word-list file that are also in the vocabulary.

    :param file_name: name of a file inside the ``word_lists`` directory next to this module
    :return: list of whitespace-separated words, in file order, restricted to the vocabulary
    """
    # read_text() opens and closes the file itself, so no handle is left dangling
    words = (_WORD_LISTS_DIR / file_name).read_text().split()
    return [w for w in words if w in _VOCAB]


nouns_plural = _load_in_vocab_words('nouns_plural_annotator2.txt')
nouns_singular = _load_in_vocab_words('nouns_singular_annotator2.txt')


def main():
    """
    Yield two sentences (one per template) for every sampled noun.

    The global RNG is seeded with 3, then NUM_NOUNS_FROM_EACH_LIST singular nouns and the same
    number of plural nouns are drawn; sentences for singular nouns are yielded first.
    """
    random.seed(3)

    # draw singular before plural: the order of the two calls determines what the seeded RNG returns
    singular_sample = random.sample(nouns_singular, k=NUM_NOUNS_FROM_EACH_LIST)
    plural_sample = random.sample(nouns_plural, k=NUM_NOUNS_FROM_EACH_LIST)

    for noun in singular_sample + plural_sample:
        for sentence_frame in (template1, template2):
            yield sentence_frame.format(noun)
Esempio n. 2
0
"""
this script calculates human-machine agreement using cohen's kappa

"""

from sklearn.metrics import cohen_kappa_score

from babeval.vocab import get_vocab, classify_vocab

def _read_words(path):
    """Return the whitespace-separated words stored in the text file at *path*."""
    # the context manager closes the handle as soon as the content has been read
    with open(path, 'r') as f:
        return f.read().split()


# NOTE: both paths are relative, so the script has to be started from the repository root
nouns_annotator2 = _read_words('babeval/agreement_across_adjectives/word_lists/nouns_annotator2.txt')

nouns_singular_nltk = classify_vocab(get_vocab())['nouns_singular']
nouns_singular_ann2 = _read_words('babeval/agreement_across_RC/word_lists/nouns_singular_annotator2.txt')

# sets make each membership test in the loop below constant-time
# NOTE(review): assumes the entries returned by classify_vocab() are hashable strings - confirm
_singular_nltk_lookup = set(nouns_singular_nltk)
_singular_ann2_lookup = set(nouns_singular_ann2)

y1 = []  # one label per noun, based on classify_vocab(): "-" = listed as singular, "P" = not listed
y2 = []  # one label per noun, based on the annotator-2 singular list, same coding
for w in nouns_annotator2:

    y1i = "-" if w in _singular_nltk_lookup else "P"
    y2i = "-" if w in _singular_ann2_lookup else "P"

    y1.append(y1i)
    y2.append(y2i)
    print(f'{w:<16} {y1i} {y2i}')

# agreement between the two label sequences
ck = cohen_kappa_score(y1, y2)
print(ck)
Esempio n. 3
0
from typing import List, Dict

from babeval import configs
from babeval.vocab import get_vocab


def to_percentile(val: float):
    """
    Map a value to the upper edge of the width-10 bin it falls in, returned as an int.

    A value in [0, 10) gives 10, a value in [10, 20) gives 20, and so on. An exact multiple
    of 10 is placed in the bin above it (e.g. 100 gives 110).
    """
    offset_in_bin = val % 10
    bin_lower_edge = val - offset_in_bin
    return int(bin_lower_edge + 10)


vocab = get_vocab()

# load bigrams
# the dicts below are keyed either by a (first word, second word) tuple or by a single word
bigram2percentile = {}  # (w1, w2) -> to_percentile() of the last field on the bigram's line
bigram2f = {}  # (w1, w2) -> frequency of the bigram
w2max_left_bigram_f = {}  # word -> highest frequency of any bigram in which it is the first word
w2max_right_bigram_f = {}  # word -> highest frequency of any bigram in which it is the second word
left_w2right_w2f = {}  # NOTE(review): not written to in the lines shown here - confirm where it is filled
right_w2_left_w2f = {}  # NOTE(review): not written to in the lines shown here - confirm where it is filled
with (configs.Dirs.root / 'word_lists' / 'bi-grams.txt').open() as f:
    for line in f.readlines():
        # a line must hold exactly four whitespace-separated fields, otherwise the unpacking raises ValueError
        frequency, w1, w2, percent = line.split()
        frequency = int(frequency)
        bigram2percentile[(w1, w2)] = to_percentile(float(percent))
        bigram2f[(w1, w2)] = frequency
        # keep running maxima; setdefault() inserts 0 for a word that is seen for the first time
        if frequency > w2max_left_bigram_f.setdefault(w1, 0):
            w2max_left_bigram_f[w1] = frequency
        if frequency > w2max_right_bigram_f.setdefault(w2, 0):
            w2max_right_bigram_f[w2] = frequency
Esempio n. 4
0
from pathlib import Path
import random

from babeval.vocab import get_vocab

NUM_SUBJECT_NOUNS_FROM_EACH_LIST = 50  # some number smaller than length of both singular and plural noun lists
NUM_OBJECT_NOUNS_FROM_EACH_LIST = 8  # some number smaller than length of both singular and plural noun lists
NUM_ADJECTIVES = 4
NUM_PREPOSITIONS = 2

template = 'the {} {} [MASK] {} .'

# Build the vocabulary lookup once instead of calling get_vocab() for every candidate word.
# NOTE(review): assumes vocabulary entries are hashable (plain strings) - confirm against get_vocab().
_VOCAB = set(get_vocab())
_WORD_LISTS_DIR = Path(__file__).parent / 'word_lists'


def _load_in_vocab_words(file_name):
    """Return the words of a word-list file that are also in the vocabulary.

    :param file_name: name of a file inside the ``word_lists`` directory next to this module
    :return: list of whitespace-separated words, in file order, restricted to the vocabulary
    """
    # read_text() opens and closes the file itself, so no handle is left dangling
    words = (_WORD_LISTS_DIR / file_name).read_text().split()
    return [w for w in words if w in _VOCAB]


nouns_plural = _load_in_vocab_words('nouns_plural_annotator2.txt')
nouns_singular = _load_in_vocab_words('nouns_singular_annotator2.txt')
prepositions = _load_in_vocab_words('prepositions_annotator2.txt')
adjectives = _load_in_vocab_words('adjectives_annotator2.txt')


def main():
    """
Esempio n. 5
0
from pathlib import Path
import random

from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 50  # there are 414 plurals
NUM_ADJECTIVES = 20

template = 'look at [MASK] {} {} .'

# Build the vocabulary lookup once instead of calling get_vocab() for every candidate word.
# NOTE(review): assumes vocabulary entries are hashable (plain strings) - confirm against get_vocab().
_VOCAB = set(get_vocab())
_WORD_LISTS_DIR = Path(__file__).parent / 'word_lists'


def _load_in_vocab_words(file_name):
    """Return the words of a word-list file that are also in the vocabulary.

    :param file_name: name of a file inside the ``word_lists`` directory next to this module
    :return: list of whitespace-separated words, in file order, restricted to the vocabulary
    """
    # read_text() opens and closes the file itself, so no handle is left dangling
    words = (_WORD_LISTS_DIR / file_name).read_text().split()
    return [w for w in words if w in _VOCAB]


# adjectives come from annotator 1's list, the nouns from annotator 2's lists
adjectives = _load_in_vocab_words('adjectives_annotator1.txt')
nouns_plural = _load_in_vocab_words('nouns_plural_annotator2.txt')
nouns_singular = _load_in_vocab_words('nouns_singular_annotator2.txt')


def main():
    """
    example:
    "look at [MASK] green sock ."
    """

    random.seed(3)  # fixed seed so that anything drawn from the global `random` state is repeatable
Esempio n. 6
0
from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 50  # there are 414 plurals
NUM_ADJECTIVES = 10

# object-relative clause
template1a = 'the {} that {} like [MASK] {} .'
template1b = 'the {} that {} likes [MASK] {} .'
# subject-relative clause - contains hint about number in relative clause
template2a = 'the {} that is there [MASK] {} .'
template2b = 'the {} that are there [MASK] {} .'

# Build the vocabulary lookup once instead of calling get_vocab() for every candidate word.
# NOTE(review): assumes vocabulary entries are hashable (plain strings) - confirm against get_vocab().
_VOCAB = set(get_vocab())
_WORD_LISTS_DIR = Path(__file__).parent / 'word_lists'


def _load_in_vocab_words(file_name):
    """Return the words of a word-list file that are also in the vocabulary.

    :param file_name: name of a file inside the ``word_lists`` directory next to this module
    :return: list of whitespace-separated words, in file order, restricted to the vocabulary
    """
    # read_text() opens and closes the file itself, so no handle is left dangling
    words = (_WORD_LISTS_DIR / file_name).read_text().split()
    return [w for w in words if w in _VOCAB]


nouns_plural = _load_in_vocab_words('nouns_plural_annotator2.txt')
nouns_singular = _load_in_vocab_words('nouns_singular_annotator2.txt')
adjectives = _load_in_vocab_words('adjectives_annotator2.txt')

# pronouns are hard-coded, but still restricted to the vocabulary like the word lists above
pronouns_1p_2p = ['I', 'you', 'we']
pronouns_1p_2p = [w for w in pronouns_1p_2p if w in _VOCAB]

pronouns_3p = ['he', 'she', 'it']
pronouns_3p = [w for w in pronouns_3p if w in _VOCAB]
Esempio n. 7
0
from pathlib import Path
import random

from babeval.vocab import get_vocab

template = 'look at {} {} [MASK] .'

pre_nominals = ['this', 'these', 'that', 'those']

# Build the vocabulary lookup once instead of calling get_vocab() for every candidate word.
# NOTE(review): assumes vocabulary entries are hashable (plain strings) - confirm against get_vocab().
_VOCAB = set(get_vocab())

# read_text() opens and closes the file itself, so no handle is left dangling
adjectives_list = (Path(__file__).parent / 'word_lists' /
                   'adjectives_annotator1.txt').read_text().split()
adjectives_list = [w for w in adjectives_list if w in _VOCAB]


def main():
    """
    Yield sentences built from `template` for every pre-nominal, with one and with two adjectives.

    example:
    "look at this green [MASK] ."
    """

    random.seed(3)  # fixed seed: the shuffles below come out the same on every run

    for pre_nominal in pre_nominals:

        # sampling k=len(...) items without replacement yields three independently shuffled copies
        al1 = random.sample(adjectives_list, k=len(adjectives_list))
        al2 = random.sample(adjectives_list, k=len(adjectives_list))
        al3 = random.sample(adjectives_list, k=len(adjectives_list))

        # NOTE(review): adj3 is not used by the two yields below - confirm whether a
        # three-adjective sentence is intended
        for adj1, adj2, adj3 in zip(al1, al2, al3):
            yield template.format(pre_nominal, ' '.join([adj1]))
            yield template.format(pre_nominal, ' '.join([adj1, adj2]))