import copy
import json
import os
import pickle

import numpy as np

# NOTE: These snippets also rely on project helpers defined elsewhere, e.g.
# create_word_vector, write_examples, get_question_for_role, shift_inputs,
# flatten_arrays, construct_all_state_sequences, hard_coded_things, base_dir,
# experiment_parameters, and embedding_size.


def generate_probestatisticsretention_fixedfiller(
        percentage_train_indistribution=100,
        filler_distribution="add05",
        normalize_fillerdistribution=True):
    """
    NOTE: "outofdistribution", "indistribution", "middistribution" is according to special filler distribution.
    """
    print("********************************************************")
    print(
        "generating probe statistics retention fixed filler with percentage_train_indistribution={percentage_train_indistribution}"
        .format(
            percentage_train_indistribution=percentage_train_indistribution))
    NUM_PERSONS_PER_CATEGORY = 1000
    NUM_DIMS = 50  # Unused here; vector dimensionality is set inside create_word_vector.
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_UNSEEN_TEST_EXAMPLES = 120
    SAVE_PATH = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "probestatisticsretention_percentageindistribution{percentage_train_indistribution}_normalizefillerdistribution{normalize_fillerdistribution}"
        .format(
            percentage_train_indistribution=percentage_train_indistribution,
            normalize_fillerdistribution=normalize_fillerdistribution))
    print("Saving to {save_path}".format(save_path=SAVE_PATH))
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(
        " ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject"]
    ROLES = ["emcee", "friend", "poet", "subject"]
    PADDING_WORD = "zzz"

    num_fillers = len(ROLES)

    # Get numerical representations of fillers. Non-negative integers are
    # in-distribution (first half seen in training, second half held out);
    # negative integers are out-of-distribution; integers beyond the
    # in-distribution maximum are mid-distribution.
    num_fillers_per_category = NUM_PERSONS_PER_CATEGORY * num_fillers
    fillers_indistribution = list(range(num_fillers_per_category * 2))
    fillers_training = fillers_indistribution[:int(num_fillers_per_category)]
    fillers_indistribution_unseen = fillers_indistribution[
        int(num_fillers_per_category):]
    fillers_outofdistribution_unseen = [
        -1 * i for i in range(1, num_fillers_per_category + 1)
    ]
    fillers_middistribution_unseen = [
        i + np.max(fillers_indistribution) + 1  # +1 avoids duplicating the max in-distribution filler
        for i in range(num_fillers_per_category)
    ]
    fillers_indistribution = [int(i) for i in fillers_indistribution]
    fillers_outofdistribution_unseen = [
        int(i) for i in fillers_outofdistribution_unseen
    ]
    fillers_middistribution_unseen = [
        int(i) for i in fillers_middistribution_unseen
    ]
    print("fillers_training", fillers_training)
    print("fillers_indistribution_unseen", fillers_indistribution_unseen)
    print("fillers_outofdistribution_unseen", fillers_outofdistribution_unseen)
    print("fillers_middistribution_indices_unseen",
          fillers_middistribution_unseen)

    # Get wordslist.
    wordslist = list(STORY_FRAME + QUESTIONS + fillers_indistribution +
                     fillers_outofdistribution_unseen +
                     fillers_middistribution_unseen)
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(np.expand_dims(np.array(
        [wordslist.index(word) for word in STORY_FRAME]),
                                                       axis=1),
                                        axis=0)
    padding = np.reshape(np.array([wordslist.index(PADDING_WORD)]), (1, 1, 1))

    # Get wordslist indices of fillers.
    fillers_indices_training = [
        wordslist.index(filler) for filler in fillers_training
    ]
    fillers_indistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_indistribution_unseen
    ]
    fillers_outofdistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_outofdistribution_unseen
    ]
    fillers_middistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_middistribution_unseen
    ]

    # For each role, reserve a block of fillers that is never bound to that
    # role during training (used to test novel role-filler bindings).
    fillers_bytrainmissingrole = dict()
    for i in range(num_fillers):
        fillers_bytrainmissingrole[ROLES[i]] = fillers_indices_training[
            i * NUM_PERSONS_PER_CATEGORY:(i + 1) * NUM_PERSONS_PER_CATEGORY]

    # Fillers that may fill each role during training: all training fillers
    # except the block reserved for that role.
    fillers_bytrainrole = dict()
    for i in range(num_fillers):
        role = ROLES[i]
        fillers_bytrainrole[role] = np.array(
            list(
                set(fillers_indices_training) -
                set(fillers_bytrainmissingrole[role])))

    # Get indices of certain words in wordslist and in story.
    role_wordindices = dict()
    for role in ROLES:
        role_wordindices[role] = wordslist.index(role)

    role_storyindices = dict()
    for role in ROLES:
        role_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == role_wordindices[role])[0]

    # Map each question's wordslist index to the story position whose filler
    # answers it; relies on QUESTIONS and ROLES being alphabetically aligned
    # (QEmcee <-> emcee, etc.).
    question_wordindices = [
        wordslist.index(question) for question in QUESTIONS
    ]
    question_storyindices = {
        question: STORY_FRAME.index(role)
        for question, role in zip(question_wordindices, ROLES)
    }

    # Generate training set.
    train_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    train_y = np.empty((0, 1))
    for i in range(NUM_TRAIN_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in ROLES:
            filler = np.random.choice(fillers_bytrainrole[role])
            story[0, role_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        train_X = np.concatenate((train_X, story), axis=0)
        train_y = np.concatenate((train_y, np.reshape(np.array(answer),
                                                      (1, 1))),
                                 axis=0)
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)

    # Generate test set with excluded role-filler pairs.
    test_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    test_y = np.empty((0, 1))
    for i in range(NUM_TEST_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in ROLES:
            filler = np.random.choice(fillers_bytrainmissingrole[role])
            story[0, role_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        test_X = np.concatenate((test_X, story), axis=0)
        test_y = np.concatenate((test_y, np.reshape(np.array(answer), (1, 1))),
                                axis=0)

    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)

    # Generate split test set with excluded role-filler pairs.
    for question in question_wordindices:
        split_test_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_test_y = np.empty((0, 1))
        for i in range(NUM_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(fillers_bytrainmissingrole[role])
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_test_X = np.concatenate((split_test_X, story), axis=0)
            split_test_y = np.concatenate(
                (split_test_y, np.reshape(np.array(answer), (1, 1))), axis=0)
        print(wordslist[question], np.unique(split_test_y))
        with open(os.path.join(SAVE_PATH, "test_%s.p" % wordslist[question]),
                  "wb") as f:
            pickle.dump([split_test_X, split_test_y], f)

    # Generate test set with unseen, in distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_indistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH,
                    "test_%s_unseen_indistribution.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate test set with unseen, out of distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_outofdistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH, "test_%s_unseen_outofdistribution.p" %
                    wordslist[question]), "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate test set with unseen, mid distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_middistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH,
                    "test_%s_unseen_middistribution.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate embedding.
    embedding = []
    fillers_training_indistribution = []
    fillers_training_outofdistribution = []
    for i in range(num_fillers):
        fillers_training_subset = fillers_training[i *
                                                   NUM_PERSONS_PER_CATEGORY:
                                                   (i + 1) *
                                                   NUM_PERSONS_PER_CATEGORY]
        fillers_training_indistribution += fillers_training_subset[:int(
            NUM_PERSONS_PER_CATEGORY * percentage_train_indistribution /
            100.0)]
        fillers_training_outofdistribution += fillers_training_subset[
            int(NUM_PERSONS_PER_CATEGORY * percentage_train_indistribution /
                100.0):]
    print("fillers training indistribution", fillers_training_indistribution)
    print("fillers_training_outofdistribution",
          fillers_training_outofdistribution)

    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if word in fillers_training_indistribution:
            print(word, "train filler, in distribution")
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_fillerdistribution=normalize_fillerdistribution)
        elif word in fillers_training_outofdistribution:
            print(word, "train filler, out of distribution")
            word_embedding['vector'] = create_word_vector()
        elif word in fillers_indistribution_unseen:
            print(word, "in distribution")
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_fillerdistribution=normalize_fillerdistribution)
        elif word in fillers_middistribution_unseen:
            print(word, "mid distribution")
            word_embedding['vector'] = create_word_vector(
                "add025",
                normalize_fillerdistribution=normalize_fillerdistribution)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)

    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)

    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
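
# create_word_vector (from embedding_util) is not shown in this file. A
# minimal sketch of what it plausibly does, judging from the distribution
# names used above ("add05" -> +0.5, "add025" -> +0.25,
# "subtract05" -> -0.5): a random Gaussian base vector plus a constant mean
# shift, optionally renormalized. The shift values, dimensionality, and
# normalization rule here are assumptions, not the project's implementation.
def create_word_vector_sketch(filler_distribution=None,
                              normalize_fillerdistribution=True,
                              num_dims=50):
    shifts = {"add05": 0.5, "add025": 0.25, "subtract05": -0.5}
    vector = np.random.randn(num_dims) / np.sqrt(num_dims)
    vector += shifts.get(filler_distribution, 0.0)
    if normalize_fillerdistribution and filler_distribution in shifts:
        vector /= np.linalg.norm(vector)  # keep shifted vectors unit-length
    return vector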
def generate_experiments(num_dims=50,
                         num_train_examples=12000,
                         num_test_examples=120,
                         num_train_fillers_per_category=10000,
                         num_test_fillers_per_category=1000,
                         normalize_filler_distribution=True,
                         dims=50,
                         percentage=80):
    """Generate probe-role-statistic-recall training and test data.

    `percentage` controls how much of each role's filler pool is drawn from
    its dominant distribution (the rest comes from the opposite one).
    NOTE: num_dims is unused; `dims` (the size of the zzz padding vector) is
    the effective dimensionality. Both default to 50.
    """
    save_dir = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "probe_role_statistic_recall_normalize_%d" % percentage)
    print("Saving to {save_dir}".format(save_dir=save_dir))
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(
        " ")
    ROLES = ["emcee", "friend", "poet", "subject"]
    NOISE_WORD = "zzz"

    questions = [get_question_for_role(role) for role in ROLES]

    # Get wordlist.
    add_05_fillers_train = [
        'add_05_train' + str(filler_index)
        for filler_index in range(num_train_fillers_per_category)
    ]
    add_05_fillers_test = [
        'add_05_test' + str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    add_05_fillers = add_05_fillers_train + add_05_fillers_test

    subtract_05_fillers_train = [
        'subtract_05_train' + str(filler_index)
        for filler_index in range(num_train_fillers_per_category)
    ]
    subtract_05_fillers_test = [
        'subtract_05_test' + str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    subtract_05_fillers = subtract_05_fillers_train + subtract_05_fillers_test
    no_addition_fillers = [
        str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    fillers = add_05_fillers + subtract_05_fillers + no_addition_fillers
    wordlist = list(STORY_FRAME + questions + fillers)
    wordlist.append(NOISE_WORD)
    story_frame_matrix = np.expand_dims(np.expand_dims(np.array(
        [wordlist.index(word) for word in STORY_FRAME]),
                                                       axis=1),
                                        axis=0)

    distribution_A_fillers_train = list(
        np.random.choice(
            add_05_fillers_train,
            size=num_train_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(subtract_05_fillers_train,
                                 size=num_train_fillers_per_category *
                                 (100 - percentage) // 100,
                                 replace=False))
    distribution_A_fillers_test = list(
        np.random.choice(
            add_05_fillers_test,
            size=num_test_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(subtract_05_fillers_test,
                                 size=num_test_fillers_per_category *
                                 (100 - percentage) // 100,
                                 replace=False))
    distribution_B_fillers_train = list(
        np.random.choice(
            subtract_05_fillers_train,
            size=num_train_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(add_05_fillers_train,
                                 size=num_train_fillers_per_category *
                                 (100 - percentage) // 100,
                                 replace=False))
    distribution_B_fillers_test = list(
        np.random.choice(
            subtract_05_fillers_test,
            size=num_test_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(add_05_fillers_test,
                                 size=num_test_fillers_per_category *
                                 (100 - percentage) // 100,
                                 replace=False))

    # Get indices of words.
    role_wordlist_indices = dict()
    for role in ROLES:
        role_wordlist_indices[role] = wordlist.index(role)

    role_story_indices = dict()
    for role in ROLES:
        role_story_indices[role] = np.where(np.array(STORY_FRAME) == role)[0]

    question_wordlist_indices = {
        role: wordlist.index(get_question_for_role(role))
        for role in ROLES
    }
    noise_wordlist_index = wordlist.index(NOISE_WORD)
    distribution_A_fillers_indices_train = [
        wordlist.index(filler) for filler in distribution_A_fillers_train
    ]
    distribution_B_fillers_indices_train = [
        wordlist.index(filler) for filler in distribution_B_fillers_train
    ]
    distribution_A_fillers_indices_test = [
        wordlist.index(filler) for filler in distribution_A_fillers_test
    ]
    distribution_B_fillers_indices_test = [
        wordlist.index(filler) for filler in distribution_B_fillers_test
    ]
    no_addition_fillers_indices = [
        wordlist.index(filler) for filler in no_addition_fillers
    ]

    # Generate training set.
    train_fillers_by_role_dict = {
        'emcee': distribution_B_fillers_indices_train,
        'friend': distribution_B_fillers_indices_train,
        'poet': distribution_A_fillers_indices_train,
        'subject': distribution_A_fillers_indices_train,
    }
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_train_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "train.p"))

    # Generate in distribution test set.
    test_fillers_by_role_dict = {
        'emcee': distribution_B_fillers_indices_test,
        'friend': distribution_B_fillers_indices_test,
        'poet': distribution_A_fillers_indices_test,
        'subject': distribution_A_fillers_indices_test,
    }
    write_examples(fillers_by_role_dict=test_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "test.p"))

    # Generate flipped-distribution test set: draw each role's filler from the
    # opposite distribution's *training* pool (seen fillers, novel pairing).
    test_flipped_distribution_fillers_by_role_dict = {
        'emcee': distribution_A_fillers_indices_train,
        'friend': distribution_A_fillers_indices_train,
        'poet': distribution_B_fillers_indices_train,
        'subject': distribution_B_fillers_indices_train,
    }
    write_examples(
        fillers_by_role_dict=test_flipped_distribution_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir, "test_flipped_distribution.p"))

    # Generate unseen flipped-distribution test set: draw from the opposite
    # distribution's *test* pool (unseen fillers).
    unseen_flipped_distribution_fillers_by_role_dict = {
        'emcee': distribution_A_fillers_indices_test,
        'friend': distribution_A_fillers_indices_test,
        'poet': distribution_B_fillers_indices_test,
        'subject': distribution_B_fillers_indices_test,
    }
    write_examples(
        fillers_by_role_dict=unseen_flipped_distribution_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir, "test_unseen_flipped_distribution.p"))

    # Generate unseen out-of-distribution test set: no-addition fillers never
    # seen during training.
    unseen_no_addition_fillers_by_role_dict = {
        'emcee': no_addition_fillers_indices,
        'friend': no_addition_fillers_indices,
        'poet': no_addition_fillers_indices,
        'subject': no_addition_fillers_indices,
    }
    write_examples(
        fillers_by_role_dict=unseen_no_addition_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir,
                               "test_unseen_no_addition_distribution.p"))

    # Generate ambiguous test set. (replace the queried filler with padding)
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir,
                                          "test_ambiguous_queried_role.p"),
                   ambiguous='queried_role')

    # Generate fully ambiguous test set. (replace the entire story frame with padding)
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "test_ambiguous_all.p"),
                   ambiguous='all')

    # Generate embedding.
    embedding = []

    for i in range(len(wordlist)):
        word = wordlist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if "add_05" in word:
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_filler_distribution=normalize_filler_distribution)
        elif "subtract_05" in word:
            word_embedding['vector'] = create_word_vector(
                "subtract05",
                normalize_filler_distribution=normalize_filler_distribution)
        elif word == NOISE_WORD:
            print('generating zzz vector')
            word_embedding['vector'] = np.zeros(dims)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)

    with open(os.path.join(save_dir, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)

    with open(os.path.join(save_dir, "wordlist.p"), "wb") as f:
        pickle.dump(wordlist, f)
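
# write_examples is called above but defined elsewhere. A sketch of a
# compatible implementation, inferred from the call sites and from the
# analogous inline loops in generate_probestatisticsretention_fixedfiller.
# The `ambiguous` handling ('queried_role' masks the queried filler, 'all'
# masks the whole story frame) follows the comments at the call sites; this
# is an assumption about the helper, not its actual source.
def write_examples_sketch(fillers_by_role_dict, story_frame_matrix,
                          num_examples, roles, role_story_indices,
                          question_wordlist_indices, noise_wordlist_index,
                          save_path, ambiguous=None):
    X = np.empty((0, story_frame_matrix.shape[1] + 2,
                  story_frame_matrix.shape[2]))
    y = np.empty((0, 1))
    padding = np.reshape(np.array([noise_wordlist_index]), (1, 1, 1))
    for _ in range(num_examples):
        story = np.copy(story_frame_matrix)
        for role in roles:
            filler = np.random.choice(fillers_by_role_dict[role])
            story[0, role_story_indices[role], 0] = filler
        queried_role = np.random.choice(roles)
        question = question_wordlist_indices[queried_role]
        answer = story[0, role_story_indices[queried_role][0], 0]
        if ambiguous == "queried_role":  # mask the filler being asked about
            story[0, role_story_indices[queried_role], 0] = noise_wordlist_index
        elif ambiguous == "all":  # mask the entire story frame
            story[:] = noise_wordlist_index
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        X = np.concatenate((X, story), axis=0)
        y = np.concatenate((y, np.reshape(answer, (1, 1))), axis=0)
    with open(save_path, "wb") as f:
        pickle.dump([X, y], f)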
def generate_train3roles_testnewrole(num_persons_per_category,
                                     filler_distribution=None):
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_UNSEEN_TEST_EXAMPLES = 120
    NUM_UNSEEN_FILLERS = 100
    SAVE_PATH = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "generate_train3roles_testnewrole_withunseentestfillers_%dpersonspercategory_%dtrain_%dtest"
        % (num_persons_per_category, NUM_TRAIN_EXAMPLES, NUM_TEST_EXAMPLES))
    if not os.path.exists(SAVE_PATH):
        os.mkdir(SAVE_PATH)

    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(
        " ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject"]
    ROLES = ["emcee", "friend", "poet", "subject", "dessert", "drink"]
    PADDING_WORD = "zzz"
    NUM_PERSON_FILLERS = 4
    filler_indices = []
    for role in ROLES:
        filler_indices += list(np.where(np.array(STORY_FRAME) == role)[0])

    person_fillers = [
        str(i) for i in range(num_persons_per_category * NUM_PERSON_FILLERS)
    ]
    person_fillers_unseenintraining = [
        str(-1 * i) for i in range(1, NUM_UNSEEN_FILLERS + 1)
    ]

    wordslist = list(
        set(STORY_FRAME + QUESTIONS + person_fillers +
            person_fillers_unseenintraining))
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(np.expand_dims(np.array(
        [wordslist.index(word) for word in STORY_FRAME]),
                                                       axis=1),
                                        axis=0)

    person_fillers_indices = [
        wordslist.index(filler) for filler in person_fillers
    ]
    person_fillers_unseenintraining_indices = [
        wordslist.index(filler) for filler in person_fillers_unseenintraining
    ]
    # For each person role, reserve a block of fillers never bound to that
    # role during training.
    person_fillers_bytrainmissingrole = {}
    for i in range(NUM_PERSON_FILLERS):
        person_fillers_bytrainmissingrole[ROLES[i]] = person_fillers_indices[
            i * num_persons_per_category:(i + 1) * num_persons_per_category]

    person_fillers_bytrainrole = {}
    for i in range(NUM_PERSON_FILLERS):
        role = ROLES[i]
        person_fillers_bytrainrole[role] = np.array(
            list(
                set(person_fillers_indices) -
                set(person_fillers_bytrainmissingrole[role])))

    # Generate train data.
    PERSON_ROLES = ["emcee", "friend", "poet", "subject"]
    person_wordindices = {}
    for role in PERSON_ROLES:
        person_wordindices[role] = wordslist.index(role)

    person_storyindices = {}
    for role in PERSON_ROLES:
        person_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == person_wordindices[role])[0]

    role_wordindices = {}
    for role in ROLES:
        role_wordindices[role] = wordslist.index(role)

    role_storyindices = {}
    for role in ROLES:
        role_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == role_wordindices[role])[0]

    question_wordindices = [
        wordslist.index(question) for question in QUESTIONS
    ]
    # zip truncates to the four person questions, so dessert and drink are
    # never queried here.
    question_storyindices = {
        question: STORY_FRAME.index(role)
        for question, role in zip(question_wordindices, ROLES)
    }
    padding = np.reshape(np.array([wordslist.index(PADDING_WORD)]), (1, 1, 1))
    train_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    train_y = np.empty((0, 1))
    for i in range(NUM_TRAIN_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in PERSON_ROLES:
            filler = np.random.choice(person_fillers_bytrainrole[role])
            story[0, person_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        train_X = np.concatenate((train_X, story), axis=0)
        train_y = np.concatenate((train_y, np.reshape(np.array(answer),
                                                      (1, 1))),
                                 axis=0)

    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)

    test_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    test_y = np.empty((0, 1))
    for i in range(NUM_TEST_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in PERSON_ROLES:
            filler = np.random.choice(person_fillers_bytrainmissingrole[role])
            story[0, person_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        test_X = np.concatenate((test_X, story), axis=0)
        test_y = np.concatenate((test_y, np.reshape(np.array(answer), (1, 1))),
                                axis=0)

    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)

    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    person_fillers_unseenintraining_indices)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(SAVE_PATH,
                             "test_%s_unseen.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    embedding = []

    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if word in person_fillers or word in person_fillers_unseenintraining:
            print(word, filler_distribution)
            word_embedding['vector'] = create_word_vector(filler_distribution)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)

    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)

    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
def generate_onefillerperrole():
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_DIMS = 50
    SAVE_PATH = os.path.join("/", "home", "cc27", "Thesis", "generalized_schema_learning", "data", "storyv2_train20000_AllQs")

    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(" ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject", "QDessert", "QDrink"]
    ROLES = ["emcee", "friend", "poet", "subject", "dessert", "drink"]
    PADDING_WORD = "zzz"
    filler_indices = []
    for role in ROLES:
        filler_indices += list(np.where(np.array(STORY_FRAME) == role)[0])
    num_questions = len(QUESTIONS)
    wordslist = list(set(STORY_FRAME + QUESTIONS))
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(np.expand_dims(np.array([wordslist.index(word) for word in STORY_FRAME]), axis=1), axis=0)

    def generate_data(num_examples, questions, roles):
        num_questions = len(questions)
        stories = np.repeat(story_frame_matrix, num_questions, axis=0)
        padding = np.reshape(np.repeat([wordslist.index(PADDING_WORD)], num_questions), (num_questions, 1, 1))
        queries = np.reshape(np.array([wordslist.index(question) for question in questions]), (num_questions, 1, 1))
        stories = np.concatenate((stories, padding, queries), axis=1)
        answers = np.reshape(np.array([wordslist.index(role) for role in roles]), (num_questions, 1))
        num_repeats = num_examples // num_questions
        stories = np.repeat(stories, num_repeats, axis=0)
        answers = np.repeat(answers, num_repeats, axis=0)
        return stories, answers

    train_X, train_y = generate_data(NUM_TRAIN_EXAMPLES, QUESTIONS, ROLES)
    test_X, test_y = generate_data(NUM_TEST_EXAMPLES, QUESTIONS, ROLES)
    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)

    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)

    for question, role in zip(QUESTIONS, ROLES):
        split_test_X, split_test_y = generate_data(NUM_TEST_EXAMPLES, [question], [role])
        with open(os.path.join(SAVE_PATH, "test_%s.p" % question), "wb") as f:
            pickle.dump([split_test_X, split_test_y], f)

    import sys
    sys.path.append("../")
    from directories import base_dir
    from embedding_util import create_word_vector

    embedding = []

    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)

    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)

    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
def write_csw_experiment(experiment_name, num_examples_per_frame, num_unseen_examples_per_frame):
    """Create train and test sets for a role-filler binding experiment.

    Assumes story files have been written by Coffee Shop world.

    Args:
        experiment_name: Name of folder in which stories are stored.
                               Assumes stories are stored in the directory
                               home_dir + "narrative/story/", where home_dir is
                               defined in directories.py.

    Saves (in the directory base_dir + "data/experiment_name/", where
           base_dir is defined in directories.py):
        train.p: A pickle file containing:
                 X: [num_train_examples x num_words_per_story x 1] matrix of train inputs.
                 y: [num_train_examples x 1] matrix of correct train outputs.
        test.p: A pickle file containing:
                 X: [num_test_examples x num_words_per_story x 1] matrix of test inputs.
                 y: [num_test_examples x 1] matrix of correct test outputs.
        Xy_english.txt: A file containing human-readable versions of the inputs,
                        correct outputs, and the word list used in the experiment.
                        (Each X and y matrix represents words by their index in
                        the word list.)
    """
    experiment_name += "_AllQs"
    experiment_data_path = os.path.join(base_dir, "data", experiment_name)
    query_delimiter = "?"
    query_starter = "Q"
    padding_word = "zzz"
    distributions_dict = {"DESSERT": "A",
            "DRINK": "B",
            "EMCEE": "A",
            "FRIEND": "B",
            "POET": "A",
            "SUBJECT": "B"}
    if not os.path.exists(experiment_data_path):
        os.makedirs(experiment_data_path)

    # Create frames.
    with open('story_frame.json', 'r') as f:
        story_frame_info = json.load(f)
    transitions = story_frame_info['transitions']
    state_contents = story_frame_info['state_contents']
    role_types = story_frame_info['role_types']
    state_sequences = construct_all_state_sequences(transitions)
    assert len(state_sequences) == 24  # the story graph admits exactly 24 orderings
    frames = [flatten_arrays([state_contents[state] for state in state_sequence]) for state_sequence in state_sequences]
    num_examples = len(frames) * num_examples_per_frame
    num_unseen_examples = len(frames) * num_unseen_examples_per_frame

    if 'variablefiller' in experiment_name:
        dummy_instances = {role: ['%sFILLER' % role] for role in role_types.keys()}
        train_instances, test_instances = dummy_instances, dummy_instances
    elif 'fixedfiller' in experiment_name:
        train_instances, test_instances = hard_coded_things.fixed_train_instances, hard_coded_things.fixed_test_instances
    else:
        raise ValueError("experiment_name must contain 'variablefiller' or 'fixedfiller'")

    query_choices = role_types.keys()
    wordslist = flatten_arrays(state_contents.values()) + flatten_arrays(train_instances.values()) + flatten_arrays(test_instances.values()) + [padding_word, query_delimiter]

    for query_choice in query_choices:
        wordslist.append(query_starter + query_choice)
    wordslist = list(set(wordslist))

    # Determine experiment information.
    max_story_length = max([len(frame) for frame in frames])
    input_dims = max_story_length + 3  # +2 for query delimiter and the actual query. +1 for padding at end.

    X = np.zeros([num_examples, input_dims, 1], dtype=np.int32)
    y = np.zeros([num_examples, 1], dtype=np.int32)
    test_unseen_X = np.zeros([num_unseen_examples, input_dims, 1], dtype=np.int32)
    test_unseen_y = np.zeros([num_unseen_examples, 1], dtype=np.int32)

    # Generate inputs and correct outputs from stories.
    for frame_index, frame in enumerate(frames):
        print('Generating for frame ', frame)
        padding_size = max_story_length - len(frame)
        frame_roles = [role for role in role_types.keys() if role in frame]
        for example_index in range(num_examples_per_frame):
            if example_index % 1000 == 0:
                print(example_index)
            story = copy.deepcopy(frame)
            role_assignments = {}
            for role in frame_roles:
                if 'fixedfiller' in experiment_name:
                    role_assignment = np.random.choice(train_instances[role_types[role]])
                    while role_assignment in role_assignments.values():
                        role_assignment = np.random.choice(train_instances[role_types[role]])
                elif 'variablefiller' in experiment_name:
                    role_assignment = '%sFILLER' % role
                role_assignments[role] = role_assignment
            story = [role_assignments[word] if word in role_assignments else word for word in story]
            queried_role = np.random.choice(list(role_assignments.keys()))
            query = query_starter + queried_role
            response = role_assignments[queried_role]

            # If necessary, add padding to end of story (ensures that inputs are all the same length).
            story += [padding_word] * (padding_size + 1)  # so we can shift all stories later.
            story += [query_delimiter, query]
            outputs = [response]

            # Convert to numerical representation and add to X and y.
            data_index = (num_examples_per_frame * frame_index) + example_index
            X[data_index, :, :] = np.expand_dims([wordslist.index(storyword) for storyword in story], axis=1)
            y[data_index, :] = [wordslist.index(output_word) for output_word in outputs]

        if 'fixedfiller' in experiment_name:
            for example_index in range(num_unseen_examples_per_frame):
                story = copy.deepcopy(frame)
                role_assignments = {}
                for role in frame_roles:
                    role_assignment = np.random.choice(test_instances[role_types[role]])
                    while role_assignment in role_assignments.values():
                        role_assignment = np.random.choice(test_instances[role_types[role]])
                    role_assignments[role] = role_assignment
                story = [role_assignments[word] if word in role_assignments else word for word in story]
                queried_role = np.random.choice(list(role_assignments.keys()))
                query = query_starter + queried_role
                response = role_assignments[queried_role]

                # If necessary, add padding to end of story (ensures that inputs are all the same length).
                story += [padding_word] * (padding_size + 1)  # so we can shift all stories later.
                story += [query_delimiter, query]
                outputs = [response]

                # Convert to numerical representation and add to X and y.
                data_index = (num_unseen_examples_per_frame * frame_index) + example_index
                test_unseen_X[data_index, :, :] = np.expand_dims([wordslist.index(storyword) for storyword in story], axis=1)
                test_unseen_y[data_index, :] = [wordslist.index(output_word) for output_word in outputs]

    # Deduplicate repeated stories (np.unique also sorts by story content).
    X, unique_seen_indices = np.unique(X, axis=0, return_index=True)
    y = y[unique_seen_indices]

    if 'fixedfiller' in experiment_name:
        num_train = int(4 * len(X) / 5)
        train_indices = np.random.choice(len(X), num_train, replace=False)
        test_indices = np.array([idx for idx in range(len(X)) if idx not in train_indices])
        train_X = X[train_indices, :, :]
        train_y = y[train_indices, :]
        test_X = X[test_indices, :, :]
        test_y = y[test_indices, :]
        test_unseen_X, unique_unseen_indices = np.unique(test_unseen_X, axis=0, return_index=True)
        test_unseen_y = test_unseen_y[unique_unseen_indices]
    elif 'variablefiller' in experiment_name:
        train_X, train_y = X, y
        test_X, test_y = X, y
        test_unseen_X, test_unseen_y = X, y

    # Save data into pickle files.
    if not os.path.exists(experiment_data_path):
        os.makedirs(experiment_data_path)
    print(experiment_data_path)
    with open(os.path.join(experiment_data_path, 'train.p'), 'wb') as f:
        pickle.dump([train_X, train_y], f)
    with open(os.path.join(experiment_data_path, 'test.p'), 'wb') as f:
        pickle.dump([test_X, test_y], f)
    with open(os.path.join(experiment_data_path, 'test_unseen.p'), 'wb') as f:
        pickle.dump([test_unseen_X, test_unseen_y], f)
    with open(os.path.join(experiment_data_path, 'wordslist.p'), 'wb') as f:
        pickle.dump(wordslist, f)

    with open('../experiment_parameters.json', 'r') as f:
        experiment_parameters = json.load(f)

    experiment_parameters['input_dims'][experiment_name] = input_dims
    fillers = list(set(flatten_arrays(train_instances.values()) + flatten_arrays(test_instances.values())))
    experiment_parameters['filler_indices'][experiment_name] = [wordslist.index(filler) for filler in fillers]
    experiment_parameters['padding_indices'][experiment_name] = wordslist.index(padding_word)

    if 'variablefiller' in experiment_name:
        experiment_parameters['query_to_filler_index'][experiment_name] = {wordslist.index(query_starter + role): [wordslist.index(filler) for filler in dummy_instances[role]] for role in role_types.keys()}
        filler_distributions_dict = {wordslist.index(dummy_instances[role][0]): distributions_dict[role] for role in role_types.keys()}
        experiment_parameters['filler_distributions'][experiment_name] = filler_distributions_dict

    with open('../experiment_parameters.json', 'w') as f:
        json.dump(experiment_parameters, f)
    embedding = []

    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)

    with open(os.path.join(experiment_data_path, "embedding.p"), 'wb') as f:
        pickle.dump(embedding, f)
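
# flatten_arrays and construct_all_state_sequences are used by
# write_csw_experiment but defined elsewhere. Minimal sketches of plausible
# implementations, assuming `transitions` maps each state to a list of
# successor states between designated start/end states; the real helpers
# (and the actual state names) may differ.
def flatten_arrays_sketch(arrays):
    return [item for array in arrays for item in array]


def construct_all_state_sequences_sketch(transitions, start="BEGIN", end="END"):
    # Depth-first enumeration of every path from start to end; the assert in
    # write_csw_experiment implies the story graph admits exactly 24 paths.
    if start == end:
        return [[end]]
    sequences = []
    for next_state in transitions.get(start, []):
        for tail in construct_all_state_sequences_sketch(transitions, next_state, end):
            sequences.append([start] + tail)
    return sequences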
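
# shift_inputs is used by generate_batch below but defined elsewhere. The
# padding comments in write_csw_experiment ("so we can shift all stories
# later") suggest it randomly shifts each story within its padding so models
# cannot rely on absolute token positions. A minimal sketch under that
# assumption; the real implementation may differ.
def shift_inputs_sketch(batch_X, experiment_name):
    # experiment_name is kept for signature parity with the real helper;
    # this sketch ignores it.
    shifted = np.copy(batch_X)
    story_len = batch_X.shape[1] - 2  # last two tokens: delimiter + query
    for i in range(batch_X.shape[0]):
        offset = np.random.randint(story_len)
        shifted[i, :story_len] = np.roll(batch_X[i, :story_len], offset)
    return shifted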
def generate_batch(X, y, FLAGS, embedding, do_shift_inputs=True, noise_proportion=0.1, zero_vector_noise=False):
    """Generate a train batch.

    Constructs batches using one of several representations (specified by
    FLAGS.filler_type):
        fixed_filler: Each word vector is specified by the embedding argument.
        variable_filler: Each non-filler word vector is specified by the
                         embedding argument. Each filler word (manually
                         specified for each experiment) is represented by a
                         newly generated random vector in each story.
        variable_filler_distributions*: As variable_filler, but each filler
                         vector is drawn from a per-role distribution (see
                         the branches below).
    Args:
        X: [num_examples x num_words_per_input x 1] matrix of inputs.
        y: [num_examples x 1] matrix of correct outputs.
        FLAGS: Parameters object of experiment information.
        embedding: [num_words x embedding_dims] matrix of word embeddings.
                   NOTE: irrelevant if using one-hot embedding.
    Yields:
        Batches of batch_size examples, each a tuple of:
            X: [batch_size x num_words_per_input x num_dimensions_per_word]
               matrix of inputs.
            y: [batch_size x num_dimensions_per_word] matrix of correct outputs.
            embedding: [num_words_in_corpus x num_dimensions_per_word] matrix
                       of vectors representing words in the batch.
    """
    # NOTE: experiment_parameters, embedding_size, and embedding_util are
    # module-level globals/imports not shown in this snippet.
    batch_size, filler_type = FLAGS.batch_size, FLAGS.filler_type
    data_size = len(X)
    num_batches = (data_size // batch_size)
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if filler_type == "fixed_filler":
            if do_shift_inputs:
                yield embedding[shift_inputs(X[start_index:end_index].squeeze(), FLAGS.experiment_name)], embedding[y[start_index:end_index].squeeze()], embedding
            else:
                yield embedding[X[start_index:end_index]].squeeze(), embedding[y[start_index:end_index].squeeze()], embedding
        elif "variable_filler" in filler_type:
            # NOTE: Filler indices manually determined using word list saved by experiment creators.
            if "distributions" in filler_type:
                filler_indices_and_distributions = experiment_parameters["filler_distributions"][FLAGS.experiment_name]
                query_to_filler_indices = experiment_parameters["query_to_filler_index"][FLAGS.experiment_name]
                filler_indices = list(filler_indices_and_distributions.keys())
                filler_distributions = [filler_indices_and_distributions[filler_index] for filler_index in filler_indices]
                filler_indices = [int(index) for index in filler_indices]
            else:
                filler_indices = experiment_parameters["filler_indices"][FLAGS.experiment_name]
            batchX, batchy = X[start_index:end_index].squeeze(), y[start_index:end_index].squeeze()
            if FLAGS.function != "analyze" and do_shift_inputs:  # Don't randomly shift inputs for decoding analysis.
                batchX = shift_inputs(batchX, FLAGS.experiment_name)
            embeddingX, embeddingy = embedding[batchX], embedding[batchy]
            padding_index = experiment_parameters['padding_indices'][FLAGS.experiment_name]
            padding_vector = embedding[padding_index]
            epoch_embedding = embedding
            for examplenum in range(batch_size):
                # Create new random embedding for each filler.
                num_fillers = len(filler_indices)
                new_filler_embedding = np.empty((num_fillers, embedding_size))
                if "distributions" in filler_type:
                    for j, filler_distribution in enumerate(filler_distributions):
                        if "variable_filler_distributions_no_subtract" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution="C")
                        elif "variable_filler_distributions_one_distribution" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution=filler_distribution, dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_all_randn_distribution" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution="randn")
                        elif "variable_filler_distributions_A" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution="A", dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_B" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution="B", dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_5050_AB" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution="B", dominant_distribution_proportion=0.5)
                        else:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(filler_distribution=filler_distribution)
                else:
                    for j in range(num_fillers):
                        new_filler_embedding[j, :] = embedding_util.create_word_vector()
                # Replace filler embedding with new random embedding.
                filler_ix_X = np.where(np.isin(batchX[examplenum], filler_indices))
                new_embedding_ix_X = [filler_indices.index(i) for i in batchX[examplenum, filler_ix_X][0]]
                embeddingX[examplenum, filler_ix_X] = new_filler_embedding[new_embedding_ix_X]
                if "noise" in filler_type:
                    if np.random.rand() < noise_proportion:
                        print('noise trial')
                        queried_filler_index = query_to_filler_indices[str(batchX[examplenum, -1])]
                        queried_filler_indices = np.where(batchX[examplenum] == queried_filler_index)
                        if zero_vector_noise:
                            print('zero vector noise')
                            embeddingX[examplenum, queried_filler_indices] = np.zeros(padding_vector.shape)
                        else:
                            embeddingX[examplenum, queried_filler_indices] = padding_vector
                new_embedding_ix_y = [filler_indices.index(batchy[examplenum])]
                embeddingy[examplenum] = new_filler_embedding[new_embedding_ix_y]
                # Append embedding to original embedding identifying response.
                epoch_embedding = np.concatenate((epoch_embedding, new_filler_embedding), axis=0)
            yield embeddingX, embeddingy, epoch_embedding
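
# A sketch of how generate_batch might be driven for the fixed_filler case,
# assuming the pickle files written by the generators above. FLAGS is
# emulated with a SimpleNamespace; the paths and field values are
# illustrative assumptions, not the project's actual configuration.
if __name__ == "__main__":
    from types import SimpleNamespace

    data_dir = "/path/to/experiment/data"  # wherever train.p and embedding.p live
    with open(os.path.join(data_dir, "train.p"), "rb") as f:
        X, y = pickle.load(f)
    # Some generators above build X/y as float arrays; cast so the values can
    # index into the embedding matrix.
    X, y = X.astype(int), y.astype(int)
    with open(os.path.join(data_dir, "embedding.p"), "rb") as f:
        embedding_dicts = pickle.load(f)
    # embedding.p stores a list of {'index', 'word', 'vector'} dicts; stack
    # the vectors into a [num_words x num_dims] matrix ordered by index.
    embedding = np.stack([entry["vector"] for entry in
                          sorted(embedding_dicts, key=lambda e: e["index"])])

    flags = SimpleNamespace(batch_size=16, filler_type="fixed_filler",
                            experiment_name="fixedfiller_AllQs",
                            function="train")
    for batch_X, batch_y, batch_embedding in generate_batch(
            X, y, flags, embedding, do_shift_inputs=False):
        print(batch_X.shape, batch_y.shape)
        break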