import random
from pathlib import Path

from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 400  # there are only 414 plurals

template1 = 'where [MASK] the {} go ?'
template2 = 'what [MASK] the {} do ?'

nouns_plural = (Path(__file__).parent / 'word_lists' / 'nouns_plural_annotator2.txt').open().read().split()
nouns_plural = [w for w in nouns_plural if w in get_vocab()]

nouns_singular = (Path(__file__).parent / 'word_lists' / 'nouns_singular_annotator2.txt').open().read().split()
nouns_singular = [w for w in nouns_singular if w in get_vocab()]


def main():
    """Yield question sentences with a masked auxiliary, for an equal number of
    singular and plural in-vocabulary nouns (two templates per noun)."""
    random.seed(3)
    # sample singular nouns first, then plural, so the random stream is stable
    sampled_nouns = random.sample(nouns_singular, k=NUM_NOUNS_FROM_EACH_LIST)
    sampled_nouns += random.sample(nouns_plural, k=NUM_NOUNS_FROM_EACH_LIST)
    for noun in sampled_nouns:
        yield template1.format(noun)
        yield template2.format(noun)
""" this script calculates human-machine agreement using cohen's kappa """ from sklearn.metrics import cohen_kappa_score from babeval.vocab import get_vocab, classify_vocab nouns_annotator2 = open('babeval/agreement_across_adjectives/word_lists/nouns_annotator2.txt', 'r').read().split() nouns_singular_nltk = classify_vocab(get_vocab())['nouns_singular'] nouns_singular_ann2 = open('babeval/agreement_across_RC/word_lists/nouns_singular_annotator2.txt', 'r').read().split() y1 = [] y2 = [] for w in nouns_annotator2: y1i = "-" if w in nouns_singular_nltk else "P" y2i = "-" if w in nouns_singular_ann2 else "P" y1.append(y1i) y2.append(y2i) print(f'{w:<16} {y1i} {y2i}') ck = cohen_kappa_score(y1, y2) print(ck)
from typing import List, Dict

from babeval import configs
from babeval.vocab import get_vocab


def to_percentile(val: float) -> int:
    # round val up to the next multiple of 10 (e.g. 23.4 -> 30, 30.0 -> 40)
    return int(val - (val % 10) + 10)


vocab = get_vocab()

# load bigrams
# maps populated from bi-grams.txt below; left_w2right_w2f and right_w2_left_w2f
# are declared here but not filled in the visible span — presumably populated
# further down the file (truncated view); TODO confirm
bigram2percentile = {}
bigram2f = {}
w2max_left_bigram_f = {}
w2max_right_bigram_f = {}
left_w2right_w2f = {}
right_w2_left_w2f = {}
with (configs.Dirs.root / 'word_lists' / 'bi-grams.txt').open() as f:
    # each line: frequency, left word, right word, percent
    for line in f.readlines():
        frequency, w1, w2, percent = line.split()
        frequency = int(frequency)
        bigram2percentile[(w1, w2)] = to_percentile(float(percent))
        bigram2f[(w1, w2)] = frequency
        # NOTE(review): the left-bigram max tracking appears commented out in the
        # original, leaving w2max_left_bigram_f empty in this span — confirm intent
        # if frequency > w2max_left_bigram_f.setdefault(w1, 0): w2max_left_bigram_f[w1] = frequency
        if frequency > w2max_right_bigram_f.setdefault(w2, 0):
            w2max_right_bigram_f[w2] = frequency
from pathlib import Path
import random

from babeval.vocab import get_vocab

NUM_SUBJECT_NOUNS_FROM_EACH_LIST = 50  # some number smaller than length of both singular and plural noun lists
NUM_OBJECT_NOUNS_FROM_EACH_LIST = 8  # some number smaller than length of both singular and plural noun lists
NUM_ADJECTIVES = 4
NUM_PREPOSITIONS = 2

# sentence frame: subject noun, preposition phrase?, masked verb, object — TODO confirm slot roles against main() below
template = 'the {} {} [MASK] {} .'

# word lists filtered to in-vocabulary items
nouns_plural = (Path(__file__).parent / 'word_lists' / 'nouns_plural_annotator2.txt').open().read().split()
nouns_plural = [w for w in nouns_plural if w in get_vocab()]

nouns_singular = (Path(__file__).parent / 'word_lists' / 'nouns_singular_annotator2.txt').open().read().split()
nouns_singular = [w for w in nouns_singular if w in get_vocab()]

prepositions = (Path(__file__).parent / 'word_lists' / 'prepositions_annotator2.txt').open().read().split()
prepositions = [w for w in prepositions if w in get_vocab()]

adjectives = (Path(__file__).parent / 'word_lists' / 'adjectives_annotator2.txt').open().read().split()
adjectives = [w for w in adjectives if w in get_vocab()]


# NOTE(review): main() continues beyond this chunk — body truncated from this view
def main():
    """
from pathlib import Path
import random

from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 50  # there are 414 plurals
NUM_ADJECTIVES = 20

# masked pre-nominal, then adjective(s), then noun
template = 'look at [MASK] {} {} .'

# word lists filtered to in-vocabulary items
adjectives = (Path(__file__).parent / 'word_lists' / 'adjectives_annotator1.txt').open().read().split()
adjectives = [w for w in adjectives if w in get_vocab()]

nouns_plural = (Path(__file__).parent / 'word_lists' / 'nouns_plural_annotator2.txt').open().read().split()
nouns_plural = [w for w in nouns_plural if w in get_vocab()]

nouns_singular = (Path(__file__).parent / 'word_lists' / 'nouns_singular_annotator2.txt').open().read().split()
nouns_singular = [w for w in nouns_singular if w in get_vocab()]


# NOTE(review): main() continues beyond this chunk — body truncated from this view
def main():
    """
    example: "look at [MASK] green sock"
    """
    random.seed(3)
# NOTE(review): Path is used below but was not imported in this chunk, which
# starts at the file's first import — adding it (harmless if already imported).
from pathlib import Path

from babeval.vocab import get_vocab

NUM_NOUNS_FROM_EACH_LIST = 50  # there are 414 plurals
NUM_ADJECTIVES = 10

# object-relative clause
template1a = 'the {} that {} like [MASK] {} .'
template1b = 'the {} that {} likes [MASK] {} .'

# subject-relative clause - contains hint about number in relative clause
template2a = 'the {} that is there [MASK] {} .'
template2b = 'the {} that are there [MASK] {} .'

# word lists filtered to in-vocabulary items
nouns_plural = (Path(__file__).parent / 'word_lists' / 'nouns_plural_annotator2.txt').open().read().split()
nouns_plural = [w for w in nouns_plural if w in get_vocab()]

nouns_singular = (Path(__file__).parent / 'word_lists' / 'nouns_singular_annotator2.txt').open().read().split()
nouns_singular = [w for w in nouns_singular if w in get_vocab()]

adjectives = (Path(__file__).parent / 'word_lists' / 'adjectives_annotator2.txt').open().read().split()
adjectives = [w for w in adjectives if w in get_vocab()]

# 1st/2nd person pronouns take plural verb agreement ("like")
pronouns_1p_2p = ['I', 'you', 'we']
pronouns_1p_2p = [w for w in pronouns_1p_2p if w in get_vocab()]

# 3rd person singular pronouns take singular verb agreement ("likes")
pronouns_3p = ['he', 'she', 'it']
pronouns_3p = [w for w in pronouns_3p if w in get_vocab()]
from pathlib import Path
import random

from babeval.vocab import get_vocab

# pre-nominal, adjective string, then masked noun
template = 'look at {} {} [MASK] .'

pre_nominals = ['this', 'these', 'that', 'those']

# adjectives filtered to in-vocabulary items
adjectives_list = (Path(__file__).parent / 'word_lists' / 'adjectives_annotator1.txt').open().read().split()
adjectives_list = [w for w in adjectives_list if w in get_vocab()]


def main():
    """
    example: "look at this green [MASK].
    """
    random.seed(3)
    for pre_nominal in pre_nominals:
        # three independent shuffles of the full adjective list
        al1 = random.sample(adjectives_list, k=len(adjectives_list))
        al2 = random.sample(adjectives_list, k=len(adjectives_list))
        al3 = random.sample(adjectives_list, k=len(adjectives_list))
        for adj1, adj2, adj3 in zip(al1, al2, al3):
            # one sentence with one adjective, one with two
            yield template.format(pre_nominal, ' '.join([adj1]))
            yield template.format(pre_nominal, ' '.join([adj1, adj2]))
            # NOTE(review): al3/adj3 is sampled but unused in the visible span —
            # a third yield with [adj1, adj2, adj3] may be truncated from this view; confirm