def generate_probestatisticsretention_fixedfiller(
        percentage_train_indistribution=100,
        filler_distribution="add05",
        normalize_fillerdistribution=True):
    """Generate a fixed-filler dataset for the probe-statistics retention experiment.

    NOTE: "outofdistribution", "indistribution", and "middistribution" are
    defined relative to the special filler distribution.
    """
    print("********************************************************")
    print("generating probe statistics retention fixed filler with "
          "percentage_train_indistribution={percentage_train_indistribution}".format(
              percentage_train_indistribution=percentage_train_indistribution))
    NUM_PERSONS_PER_CATEGORY = 1000
    NUM_DIMS = 50
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_UNSEEN_TEST_EXAMPLES = 120
    SAVE_PATH = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "probestatisticsretention_percentageindistribution{percentage_train_indistribution}_normalizefillerdistribution{normalize_fillerdistribution}"
        .format(
            percentage_train_indistribution=percentage_train_indistribution,
            normalize_fillerdistribution=normalize_fillerdistribution))
    print("Saving to {save_path}".format(save_path=SAVE_PATH))
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(" ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject"]
    ROLES = ["emcee", "friend", "poet", "subject"]
    PADDING_WORD = "zzz"
    num_fillers = len(ROLES)

    # Get numerical representations of fillers.
    num_fillers_per_category = NUM_PERSONS_PER_CATEGORY * num_fillers
    fillers_indistribution = [i for i in range(num_fillers_per_category * 2)]
    fillers_training = fillers_indistribution[:int(num_fillers_per_category)]
    fillers_indistribution_unseen = fillers_indistribution[
        int(num_fillers_per_category):]
    fillers_outofdistribution_unseen = [
        -1 * i for i in range(1, num_fillers_per_category + 1)
    ]
    fillers_middistribution_unseen = [
        i + np.max(fillers_indistribution)
        for i in range(num_fillers_per_category)
    ]
    fillers_indistribution = [int(i) for i in fillers_indistribution]
    fillers_outofdistribution_unseen = [
        int(i) for i in fillers_outofdistribution_unseen
    ]
    fillers_middistribution_unseen = [
        int(i) for i in fillers_middistribution_unseen
    ]
    print("fillers_training", fillers_training)
    print("fillers_indistribution_unseen", fillers_indistribution_unseen)
    print("fillers_outofdistribution_unseen", fillers_outofdistribution_unseen)
    print("fillers_middistribution_unseen", fillers_middistribution_unseen)

    # Get wordslist.
    wordslist = list(STORY_FRAME + QUESTIONS + fillers_indistribution +
                     fillers_outofdistribution_unseen +
                     fillers_middistribution_unseen)
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(
        np.expand_dims(np.array([wordslist.index(word) for word in STORY_FRAME]),
                       axis=1),
        axis=0)
    padding = np.reshape(np.array([wordslist.index(PADDING_WORD)]), (1, 1, 1))

    # Get wordslist indices of fillers.
    fillers_indices_training = [
        wordslist.index(filler) for filler in fillers_training
    ]
    fillers_indistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_indistribution_unseen
    ]
    fillers_outofdistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_outofdistribution_unseen
    ]
    fillers_middistribution_indices_unseen = [
        wordslist.index(filler) for filler in fillers_middistribution_unseen
    ]

    # Get fillers used in each role during training.
    fillers_bytrainmissingrole = dict()
    for i in range(num_fillers):
        fillers_bytrainmissingrole[ROLES[i]] = fillers_indices_training[
            i * NUM_PERSONS_PER_CATEGORY:(i + 1) * NUM_PERSONS_PER_CATEGORY]
    fillers_bytrainrole = dict()
    for i in range(num_fillers):
        role = ROLES[i]
        fillers_bytrainrole[role] = np.array(
            list(
                set(fillers_indices_training) -
                set(fillers_bytrainmissingrole[role])))

    # Get indices of certain words in wordslist and in story.
    role_wordindices = dict()
    for role in ROLES:
        role_wordindices[role] = wordslist.index(role)
    role_storyindices = dict()
    for role in ROLES:
        role_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == role_wordindices[role])[0]
    question_wordindices = [
        wordslist.index(question) for question in QUESTIONS
    ]
    question_storyindices = {
        question: STORY_FRAME.index(role)
        for question, role in zip(question_wordindices, ROLES)
    }

    train_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    train_y = np.empty((0, 1))
    for i in range(NUM_TRAIN_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in ROLES:
            filler = np.random.choice(fillers_bytrainrole[role])
            story[0, role_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        train_X = np.concatenate((train_X, story), axis=0)
        train_y = np.concatenate(
            (train_y, np.reshape(np.array(answer), (1, 1))), axis=0)
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)
    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)

    # Generate test set with excluded role-filler pairs.
    test_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    test_y = np.empty((0, 1))
    for i in range(NUM_TEST_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in ROLES:
            filler = np.random.choice(fillers_bytrainmissingrole[role])
            story[0, role_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        test_X = np.concatenate((test_X, story), axis=0)
        test_y = np.concatenate(
            (test_y, np.reshape(np.array(answer), (1, 1))), axis=0)
    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)

    # Generate split test set with excluded role-filler pairs.
    for question in question_wordindices:
        split_test_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_test_y = np.empty((0, 1))
        for i in range(NUM_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(fillers_bytrainmissingrole[role])
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_test_X = np.concatenate((split_test_X, story), axis=0)
            split_test_y = np.concatenate(
                (split_test_y, np.reshape(np.array(answer), (1, 1))), axis=0)
        print(wordslist[question], np.unique(split_test_y))
        with open(os.path.join(SAVE_PATH, "test_%s.p" % wordslist[question]),
                  "wb") as f:
            pickle.dump([split_test_X, split_test_y], f)

    # Generate test set with unseen, in-distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_indistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH,
                    "test_%s_unseen_indistribution.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate test set with unseen, out-of-distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_outofdistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH,
                    "test_%s_unseen_outofdistribution.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate test set with unseen, mid-distribution fillers.
    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    fillers_middistribution_indices_unseen)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(
                    SAVE_PATH,
                    "test_%s_unseen_middistribution.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    # Generate embedding.
    embedding = []
    fillers_training_indistribution = []
    fillers_training_outofdistribution = []
    for i in range(num_fillers):
        fillers_training_subset = fillers_training[
            i * NUM_PERSONS_PER_CATEGORY:(i + 1) * NUM_PERSONS_PER_CATEGORY]
        fillers_training_indistribution += fillers_training_subset[:int(
            NUM_PERSONS_PER_CATEGORY * percentage_train_indistribution / 100.0)]
        fillers_training_outofdistribution += fillers_training_subset[int(
            NUM_PERSONS_PER_CATEGORY * percentage_train_indistribution / 100.0):]
    print("fillers_training_indistribution", fillers_training_indistribution)
    print("fillers_training_outofdistribution",
          fillers_training_outofdistribution)
    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if word in fillers_training_indistribution:
            print(word, "train filler, in distribution")
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_fillerdistribution=normalize_fillerdistribution)
        elif word in fillers_training_outofdistribution:
            print(word, "train filler, out of distribution")
            word_embedding['vector'] = create_word_vector()
        elif word in fillers_indistribution_unseen:
            print(word, "in distribution")
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_fillerdistribution=normalize_fillerdistribution)
        elif word in fillers_middistribution_unseen:
            print(word, "mid distribution")
            word_embedding['vector'] = create_word_vector(
                "add025",
                normalize_fillerdistribution=normalize_fillerdistribution)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)
    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)
    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
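
# Hedged usage sketch (not part of the original pipeline): each split written
# by the generator above is pickled as a [X, y] pair, so a generated split can
# be reloaded and inspected as below. The save_path argument is assumed to
# match the SAVE_PATH the generator printed; os and pickle are assumed to be
# imported at module scope, as the generators above already require.
def _inspect_split(save_path, split_name="train"):
    """Load a pickled [X, y] split and report its dimensions."""
    with open(os.path.join(save_path, "%s.p" % split_name), "rb") as f:
        X, y = pickle.load(f)
    # X: [num_examples x (story_length + 2) x 1]; y: [num_examples x 1].
    print(split_name, "X shape:", X.shape, "y shape:", y.shape)
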

def generate_experiments(num_dims=50,
                         num_train_examples=12000,
                         num_test_examples=120,
                         num_train_fillers_per_category=10000,
                         num_test_fillers_per_category=1000,
                         normalize_filler_distribution=True,
                         dims=50,
                         percentage=80):
    save_dir = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "probe_role_statistic_recall_normalize_%d" % percentage)
    print("Saving to {save_dir}".format(save_dir=save_dir))
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(" ")
    ROLES = ["emcee", "friend", "poet", "subject"]
    NOISE_WORD = "zzz"
    questions = [get_question_for_role(role) for role in ROLES]

    # Get wordlist.
    add_05_fillers_train = [
        'add_05_train' + str(filler_index)
        for filler_index in range(num_train_fillers_per_category)
    ]
    add_05_fillers_test = [
        'add_05_test' + str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    add_05_fillers = add_05_fillers_train + add_05_fillers_test
    subtract_05_fillers_train = [
        'subtract_05_train' + str(filler_index)
        for filler_index in range(num_train_fillers_per_category)
    ]
    subtract_05_fillers_test = [
        'subtract_05_test' + str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    subtract_05_fillers = subtract_05_fillers_train + subtract_05_fillers_test
    no_addition_fillers = [
        str(filler_index)
        for filler_index in range(num_test_fillers_per_category)
    ]
    fillers = add_05_fillers + subtract_05_fillers + no_addition_fillers
    wordlist = list(STORY_FRAME + questions + fillers)
    wordlist.append(NOISE_WORD)
    story_frame_matrix = np.expand_dims(
        np.expand_dims(np.array([wordlist.index(word) for word in STORY_FRAME]),
                       axis=1),
        axis=0)
    distribution_A_fillers_train = list(
        np.random.choice(
            add_05_fillers_train,
            size=num_train_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(
                    subtract_05_fillers_train,
                    size=num_train_fillers_per_category * (100 - percentage) // 100,
                    replace=False))
    distribution_A_fillers_test = list(
        np.random.choice(
            add_05_fillers_test,
            size=num_test_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(
                    subtract_05_fillers_test,
                    size=num_test_fillers_per_category * (100 - percentage) // 100,
                    replace=False))
    distribution_B_fillers_train = list(
        np.random.choice(
            subtract_05_fillers_train,
            size=num_train_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(
                    add_05_fillers_train,
                    size=num_train_fillers_per_category * (100 - percentage) // 100,
                    replace=False))
    distribution_B_fillers_test = list(
        np.random.choice(
            subtract_05_fillers_test,
            size=num_test_fillers_per_category * percentage // 100,
            replace=False)) + list(
                np.random.choice(
                    add_05_fillers_test,
                    size=num_test_fillers_per_category * (100 - percentage) // 100,
                    replace=False))

    # Get indices of words.
    role_wordlist_indices = dict()
    for role in ROLES:
        role_wordlist_indices[role] = wordlist.index(role)
    role_story_indices = dict()
    for role in ROLES:
        role_story_indices[role] = np.where(np.array(STORY_FRAME) == role)[0]
    question_wordlist_indices = {
        role: wordlist.index(get_question_for_role(role))
        for role in ROLES
    }
    noise_wordlist_index = wordlist.index(NOISE_WORD)
    distribution_A_fillers_indices_train = [
        wordlist.index(filler) for filler in distribution_A_fillers_train
    ]
    distribution_B_fillers_indices_train = [
        wordlist.index(filler) for filler in distribution_B_fillers_train
    ]
    distribution_A_fillers_indices_test = [
        wordlist.index(filler) for filler in distribution_A_fillers_test
    ]
    distribution_B_fillers_indices_test = [
        wordlist.index(filler) for filler in distribution_B_fillers_test
    ]
    no_addition_fillers_indices = [
        wordlist.index(filler) for filler in no_addition_fillers
    ]

    # Generate training set.
    train_fillers_by_role_dict = {
        'emcee': distribution_B_fillers_indices_train,
        'friend': distribution_B_fillers_indices_train,
        'poet': distribution_A_fillers_indices_train,
        'subject': distribution_A_fillers_indices_train,
    }
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_train_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "train.p"))

    # Generate in-distribution test set.
    test_fillers_by_role_dict = {
        'emcee': distribution_B_fillers_indices_test,
        'friend': distribution_B_fillers_indices_test,
        'poet': distribution_A_fillers_indices_test,
        'subject': distribution_A_fillers_indices_test,
    }
    write_examples(fillers_by_role_dict=test_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "test.p"))

    # Generate flipped-distribution test set (draw each filler from the
    # opposite training pool).
    test_flipped_distribution_fillers_by_role_dict = {
        'emcee': distribution_A_fillers_indices_train,
        'friend': distribution_A_fillers_indices_train,
        'poet': distribution_B_fillers_indices_train,
        'subject': distribution_B_fillers_indices_train,
    }
    write_examples(
        fillers_by_role_dict=test_flipped_distribution_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir, "test_flipped_distribution.p"))

    # Generate unseen flipped-distribution test set (draw each filler from the
    # opposite held-out test pool).
    unseen_flipped_distribution_fillers_by_role_dict = {
        'emcee': distribution_A_fillers_indices_test,
        'friend': distribution_A_fillers_indices_test,
        'poet': distribution_B_fillers_indices_test,
        'subject': distribution_B_fillers_indices_test,
    }
    write_examples(
        fillers_by_role_dict=unseen_flipped_distribution_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir, "test_unseen_flipped_distribution.p"))

    # Generate unseen out-of-distribution (no-addition) test set.
    unseen_no_addition_fillers_by_role_dict = {
        'emcee': no_addition_fillers_indices,
        'friend': no_addition_fillers_indices,
        'poet': no_addition_fillers_indices,
        'subject': no_addition_fillers_indices,
    }
    write_examples(
        fillers_by_role_dict=unseen_no_addition_fillers_by_role_dict,
        story_frame_matrix=story_frame_matrix,
        num_examples=num_test_examples,
        roles=ROLES,
        role_story_indices=role_story_indices,
        question_wordlist_indices=question_wordlist_indices,
        noise_wordlist_index=noise_wordlist_index,
        save_path=os.path.join(save_dir,
                               "test_unseen_no_addition_distribution.p"))

    # Generate ambiguous test set (replace the queried filler with padding).
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir,
                                          "test_ambiguous_queried_role.p"),
                   ambiguous='queried_role')

    # Generate fully ambiguous test set (replace the entire story frame with
    # padding).
    write_examples(fillers_by_role_dict=train_fillers_by_role_dict,
                   story_frame_matrix=story_frame_matrix,
                   num_examples=num_test_examples,
                   roles=ROLES,
                   role_story_indices=role_story_indices,
                   question_wordlist_indices=question_wordlist_indices,
                   noise_wordlist_index=noise_wordlist_index,
                   save_path=os.path.join(save_dir, "test_ambiguous_all.p"),
                   ambiguous='all')

    # Generate embedding.
    embedding = []
    for i in range(len(wordlist)):
        word = wordlist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if "add_05" in word:
            word_embedding['vector'] = create_word_vector(
                "add05",
                normalize_filler_distribution=normalize_filler_distribution)
        elif "subtract_05" in word:
            word_embedding['vector'] = create_word_vector(
                "subtract05",
                normalize_filler_distribution=normalize_filler_distribution)
        elif word == 'zzz':
            print('generating zzz vector')
            word_embedding['vector'] = np.zeros(dims)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)
    with open(os.path.join(save_dir, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)
    with open(os.path.join(save_dir, "wordlist.p"), "wb") as f:
        pickle.dump(wordlist, f)
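
# get_question_for_role is defined elsewhere in this repository. A minimal
# sketch consistent with the "QEmcee"/"QFriend"/... question tokens used by
# the other generators in this file would be the following; this is an
# assumption about the helper's behavior, not its actual source.
def _get_question_for_role_sketch(role):
    """Map a role name to its query token, e.g. "emcee" -> "QEmcee"."""
    return "Q" + role.capitalize()
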

def generate_train3roles_testnewrole(num_persons_per_category,
                                     filler_distribution=None):
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_UNSEEN_TEST_EXAMPLES = 120
    NUM_UNSEEN_FILLERS = 100
    SAVE_PATH = os.path.join(
        "/", "home", "cc27", "Thesis", "generalized_schema_learning", "data",
        "generate_train3roles_testnewrole_withunseentestfillers_%dpersonspercategory_%dtrain_%dtest"
        % (num_persons_per_category, NUM_TRAIN_EXAMPLES, NUM_TEST_EXAMPLES))
    if not os.path.exists(SAVE_PATH):
        os.mkdir(SAVE_PATH)
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(" ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject"]
    ROLES = ["emcee", "friend", "poet", "subject", "dessert", "drink"]
    PADDING_WORD = "zzz"
    NUM_PERSON_FILLERS = 4
    filler_indices = []
    for role in ROLES:
        filler_indices += list(np.where(np.array(STORY_FRAME) == role)[0])
    person_fillers = [
        str(i) for i in range(num_persons_per_category * NUM_PERSON_FILLERS)
    ]
    person_fillers_unseenintraining = [
        str(-1 * i) for i in range(1, NUM_UNSEEN_FILLERS + 1)
    ]
    wordslist = list(
        set(STORY_FRAME + QUESTIONS + person_fillers +
            person_fillers_unseenintraining))
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(
        np.expand_dims(np.array([wordslist.index(word) for word in STORY_FRAME]),
                       axis=1),
        axis=0)
    person_fillers_indices = [
        wordslist.index(filler) for filler in person_fillers
    ]
    person_fillers_unseenintraining_indices = [
        wordslist.index(filler) for filler in person_fillers_unseenintraining
    ]
    person_fillers_bytrainmissingrole = {}
    for i in range(NUM_PERSON_FILLERS):
        person_fillers_bytrainmissingrole[ROLES[i]] = person_fillers_indices[
            i * num_persons_per_category:(i + 1) * num_persons_per_category]
    person_fillers_bytrainrole = {}
    for i in range(NUM_PERSON_FILLERS):
        role = ROLES[i]
        person_fillers_bytrainrole[role] = np.array(
            list(
                set(person_fillers_indices) -
                set(person_fillers_bytrainmissingrole[role])))

    # Generate train data.
    PERSON_ROLES = ["emcee", "friend", "poet", "subject"]
    person_wordindices = {}
    for role in PERSON_ROLES:
        person_wordindices[role] = wordslist.index(role)
    person_storyindices = {}
    for role in PERSON_ROLES:
        person_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == person_wordindices[role])[0]
    role_wordindices = {}
    for role in ROLES:
        role_wordindices[role] = wordslist.index(role)
    role_storyindices = {}
    for role in ROLES:
        role_storyindices[role] = np.where(
            np.squeeze(story_frame_matrix) == role_wordindices[role])[0]
    question_wordindices = [
        wordslist.index(question) for question in QUESTIONS
    ]
    question_storyindices = {
        question: STORY_FRAME.index(role)
        for question, role in zip(question_wordindices, ROLES)
    }
    padding = np.reshape(np.array([wordslist.index(PADDING_WORD)]), (1, 1, 1))
    train_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    train_y = np.empty((0, 1))
    for i in range(NUM_TRAIN_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in PERSON_ROLES:
            filler = np.random.choice(person_fillers_bytrainrole[role])
            story[0, person_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        train_X = np.concatenate((train_X, story), axis=0)
        train_y = np.concatenate(
            (train_y, np.reshape(np.array(answer), (1, 1))), axis=0)
    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)

    test_X = np.empty(
        (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
    test_y = np.empty((0, 1))
    for i in range(NUM_TEST_EXAMPLES):
        story = np.copy(story_frame_matrix)
        for role in PERSON_ROLES:
            filler = np.random.choice(person_fillers_bytrainmissingrole[role])
            story[0, person_storyindices[role], 0] = filler
        question = np.random.choice(question_wordindices)
        answer = [story.squeeze()[question_storyindices[question]]]
        story = np.concatenate(
            (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
        test_X = np.concatenate((test_X, story), axis=0)
        test_y = np.concatenate(
            (test_y, np.reshape(np.array(answer), (1, 1))), axis=0)
    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)

    for question in question_wordindices:
        split_testunseen_X = np.empty(
            (0, story_frame_matrix.shape[1] + 2, story_frame_matrix.shape[2]))
        split_testunseen_y = np.empty((0, 1))
        for i in range(NUM_UNSEEN_TEST_EXAMPLES):
            story = np.copy(story_frame_matrix)
            for role in ROLES:
                filler = np.random.choice(
                    person_fillers_unseenintraining_indices)
                story[0, role_storyindices[role], 0] = filler
            answer = [story.squeeze()[question_storyindices[question]]]
            story = np.concatenate(
                (story, padding, np.reshape(question, (1, 1, 1))), axis=1)
            split_testunseen_X = np.concatenate((split_testunseen_X, story),
                                                axis=0)
            split_testunseen_y = np.concatenate(
                (split_testunseen_y, np.reshape(np.array(answer), (1, 1))),
                axis=0)
        with open(
                os.path.join(SAVE_PATH,
                             "test_%s_unseen.p" % wordslist[question]),
                "wb") as f:
            pickle.dump([split_testunseen_X, split_testunseen_y], f)

    embedding = []
    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        if word in person_fillers or word in person_fillers_unseenintraining:
            print(word, filler_distribution)
            word_embedding['vector'] = create_word_vector(filler_distribution)
        else:
            word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)
    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)
    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
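
# Hedged sanity check (illustrative, not part of the original pipeline): in
# the held-out role-filler design above, each role's training pool and its
# held-out ("missing") pool must be disjoint. Given the two dicts built inside
# generate_train3roles_testnewrole (person_fillers_bytrainrole and
# person_fillers_bytrainmissingrole), the invariant can be checked like this.
def _check_heldout_split(fillers_bytrainrole, fillers_bytrainmissingrole):
    """Assert that each role's train fillers exclude its held-out fillers."""
    for role, train_pool in fillers_bytrainrole.items():
        held_out = set(fillers_bytrainmissingrole[role])
        assert not set(train_pool) & held_out, role
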

def generate_onefillerperrole():
    NUM_TRAIN_EXAMPLES = 24000
    NUM_TEST_EXAMPLES = 120
    NUM_DIMS = 50
    SAVE_PATH = os.path.join("/", "home", "cc27", "Thesis",
                             "generalized_schema_learning", "data",
                             "storyv2_train20000_AllQs")
    # Create the output directory before writing, as the other generators do.
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)
    STORY_FRAME = "begin subject sit subject friend announce emcee perform poet consume dessert drink goodbye".split(" ")
    QUESTIONS = ["QEmcee", "QFriend", "QPoet", "QSubject", "QDessert", "QDrink"]
    ROLES = ["emcee", "friend", "poet", "subject", "dessert", "drink"]
    PADDING_WORD = "zzz"
    filler_indices = []
    for role in ROLES:
        filler_indices += list(np.where(np.array(STORY_FRAME) == role)[0])
    num_questions = len(QUESTIONS)
    wordslist = list(set(STORY_FRAME + QUESTIONS))
    wordslist.append(PADDING_WORD)
    story_frame_matrix = np.expand_dims(
        np.expand_dims(np.array([wordslist.index(word) for word in STORY_FRAME]),
                       axis=1),
        axis=0)

    def generate_data(num_examples, questions, roles):
        num_questions = len(questions)
        stories = np.repeat(story_frame_matrix, num_questions, axis=0)
        padding = np.reshape(
            np.repeat([wordslist.index(PADDING_WORD)], num_questions),
            (num_questions, 1, 1))
        queries = np.reshape(
            np.array([wordslist.index(question) for question in questions]),
            (num_questions, 1, 1))
        stories = np.concatenate((stories, padding, queries), axis=1)
        answers = np.reshape(
            np.array([wordslist.index(role) for role in roles]),
            (num_questions, 1))
        num_repeats = num_examples // num_questions
        stories = np.repeat(stories, num_repeats, axis=0)
        answers = np.repeat(answers, num_repeats, axis=0)
        return stories, answers

    train_X, train_y = generate_data(NUM_TRAIN_EXAMPLES, QUESTIONS, ROLES)
    test_X, test_y = generate_data(NUM_TEST_EXAMPLES, QUESTIONS, ROLES)
    with open(os.path.join(SAVE_PATH, "train.p"), "wb") as f:
        pickle.dump([train_X, train_y], f)
    with open(os.path.join(SAVE_PATH, "test.p"), "wb") as f:
        pickle.dump([test_X, test_y], f)
    for question, role in zip(QUESTIONS, ROLES):
        split_test_X, split_test_y = generate_data(NUM_TEST_EXAMPLES,
                                                   [question], [role])
        with open(os.path.join(SAVE_PATH, "test_%s.p" % question), "wb") as f:
            pickle.dump([split_test_X, split_test_y], f)

    import sys
    sys.path.append("../")
    from directories import base_dir
    from embedding_util import create_word_vector
    embedding = []
    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)
    with open(os.path.join(SAVE_PATH, "embedding.p"), "wb") as f:
        pickle.dump(embedding, f)
    with open(os.path.join(SAVE_PATH, "wordslist.p"), "wb") as f:
        pickle.dump(wordslist, f)
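
# Worked example of generate_data's sizing (a sketch of the arithmetic above,
# not new behavior): with NUM_TRAIN_EXAMPLES = 24000 and 6 questions,
# num_repeats = 24000 // 6 = 4000, so each (question, role) pair is repeated
# 4000 times and train_X has exactly 24000 rows. For the per-question test
# splits, num_examples = 120 with a single question, so the one story-query
# pair is repeated 120 times.
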

def write_csw_experiment(experiment_name, num_examples_per_frame,
                         num_unseen_examples_per_frame):
    """Create train and test sets for a role-filler binding experiment.

    Assumes story files have been written by Coffee Shop World.

    Args:
        experiment_name: Name of the folder in which stories are stored.
            Assumes stories are stored in the directory
            home_dir + "narrative/story/", where home_dir is defined in
            directories.py.

    Saves (in the directory base_dir + "data/experiment_name/", where base_dir
    is defined in directories.py):
        train.p: A pickle file containing:
            X: [num_train_examples x num_words_per_story x 1] matrix of train inputs.
            y: [num_train_examples x 1] matrix of correct train outputs.
        test.p: A pickle file containing:
            X: [num_test_examples x num_words_per_story x 1] matrix of test inputs.
            y: [num_test_examples x 1] matrix of correct test outputs.
        Xy_english.txt: A file containing human-readable versions of the
            inputs, correct outputs, and the word list used in the experiment.
            (Each X and y matrix represents words by their index in the word
            list.)
    """
    print(experiment_name)
    experiment_name += "_AllQs"
    experiment_data_path = os.path.join(base_dir, "data", experiment_name)
    query_delimiter = "?"
    query_starter = "Q"
    padding_word = "zzz"
    distributions_dict = {"DESSERT": "A", "DRINK": "B", "EMCEE": "A",
                          "FRIEND": "B", "POET": "A", "SUBJECT": "B"}
    if not os.path.exists(experiment_data_path):
        os.makedirs(experiment_data_path)

    # Create frames.
    with open('story_frame.json', 'r') as f:
        story_frame_info = json.load(f)
    transitions = story_frame_info['transitions']
    state_contents = story_frame_info['state_contents']
    role_types = story_frame_info['role_types']
    state_sequences = construct_all_state_sequences(transitions)
    assert len(state_sequences) == 24
    frames = [
        flatten_arrays([state_contents[state] for state in state_sequence])
        for state_sequence in state_sequences
    ]
    num_examples = len(frames) * num_examples_per_frame
    num_unseen_examples = len(frames) * num_unseen_examples_per_frame
    if 'variablefiller' in experiment_name:
        dummy_instances = {role: ['%sFILLER' % role]
                           for role in role_types.keys()}
        train_instances, test_instances = dummy_instances, dummy_instances
    elif 'fixedfiller' in experiment_name:
        train_instances, test_instances = (
            hard_coded_things.fixed_train_instances,
            hard_coded_things.fixed_test_instances)
    query_choices = role_types.keys()
    wordslist = (flatten_arrays(state_contents.values()) +
                 flatten_arrays(train_instances.values()) +
                 flatten_arrays(test_instances.values()) +
                 [padding_word, query_delimiter])
    for query_choice in query_choices:
        wordslist.append(query_starter + query_choice)
    wordslist = list(set(wordslist))

    # Determine experiment information.
    max_story_length = max([len(frame) for frame in frames])
    input_dims = max_story_length + 3  # +2 for the query delimiter and the query itself; +1 for padding at the end.
    X = np.zeros([num_examples, input_dims, 1], dtype=np.int32)
    y = np.zeros([num_examples, 1], dtype=np.int32)
    test_unseen_X = np.zeros([num_unseen_examples, input_dims, 1],
                             dtype=np.int32)
    test_unseen_y = np.zeros([num_unseen_examples, 1], dtype=np.int32)

    # Generate inputs and correct outputs from stories.
    for frame_index, frame in enumerate(frames):
        print('Generating for frame ', frame)
        padding_size = max_story_length - len(frame)
        frame_roles = [role for role in role_types.keys() if role in frame]
        for example_index in range(num_examples_per_frame):
            if example_index % 1000 == 0:
                print(example_index)
            story = copy.deepcopy(frame)
            role_assignments = {}
            for role in frame_roles:
                if 'fixedfiller' in experiment_name:
                    role_assignment = np.random.choice(
                        train_instances[role_types[role]])
                    while role_assignment in role_assignments.values():
                        role_assignment = np.random.choice(
                            train_instances[role_types[role]])
                elif 'variablefiller' in experiment_name:
                    role_assignment = '%sFILLER' % role
                role_assignments[role] = role_assignment
            story = [role_assignments[word] if word in role_assignments else word
                     for word in story]
            queried_role = np.random.choice(list(role_assignments.keys()))
            query = query_starter + queried_role
            response = role_assignments[queried_role]
            # If necessary, add padding to the end of the story (ensures that
            # inputs are all the same length). The extra padding word lets all
            # stories be shifted later.
            story += [padding_word] * (padding_size + 1)
            story += [query_delimiter, query]
            outputs = [response]
            # Convert to numerical representation and add to X and y.
            data_index = (num_examples_per_frame * frame_index) + example_index
            X[data_index, :, :] = np.expand_dims(
                [wordslist.index(storyword) for storyword in story], axis=1)
            y[data_index, :] = [wordslist.index(output_word)
                                for output_word in outputs]
        if 'fixedfiller' in experiment_name:
            for example_index in range(num_unseen_examples_per_frame):
                story = copy.deepcopy(frame)
                role_assignments = {}
                for role in frame_roles:
                    role_assignment = np.random.choice(
                        test_instances[role_types[role]])
                    while role_assignment in role_assignments.values():
                        role_assignment = np.random.choice(
                            test_instances[role_types[role]])
                    role_assignments[role] = role_assignment
                story = [role_assignments[word] if word in role_assignments
                         else word for word in story]
                queried_role = np.random.choice(list(role_assignments.keys()))
                query = query_starter + queried_role
                response = role_assignments[queried_role]
                # If necessary, add padding to the end of the story (ensures
                # that inputs are all the same length). The extra padding word
                # lets all stories be shifted later.
                story += [padding_word] * (padding_size + 1)
                story += [query_delimiter, query]
                outputs = [response]
                # Convert to numerical representation and add to X and y.
                data_index = ((num_unseen_examples_per_frame * frame_index) +
                              example_index)
                test_unseen_X[data_index, :, :] = np.expand_dims(
                    [wordslist.index(storyword) for storyword in story], axis=1)
                test_unseen_y[data_index, :] = [
                    wordslist.index(output_word) for output_word in outputs]

    # Remove repeated stories.
    X, unique_seen_indices = np.unique(X, axis=0, return_index=True)
    y = y[unique_seen_indices]
    if 'fixedfiller' in experiment_name:
        num_train = int(4 * len(X) / 5)
        train_indices = np.random.choice(len(X), num_train, replace=False)
        test_indices = np.array(
            [idx for idx in range(len(X)) if idx not in train_indices])
        train_X = X[train_indices, :, :]
        train_y = y[train_indices, :]
        test_X = X[test_indices, :, :]
        test_y = y[test_indices, :]
        test_unseen_X, unique_unseen_indices = np.unique(
            test_unseen_X, axis=0, return_index=True)
        test_unseen_y = test_unseen_y[unique_unseen_indices]
    elif 'variablefiller' in experiment_name:
        train_X, train_y = X, y
        test_X, test_y = X, y
        test_unseen_X, test_unseen_y = X, y

    # Save data into pickle files.
    if not os.path.exists(experiment_data_path):
        os.makedirs(experiment_data_path)
    print(experiment_data_path)
    with open(os.path.join(experiment_data_path, 'train.p'), 'wb') as f:
        pickle.dump([train_X, train_y], f)
    with open(os.path.join(experiment_data_path, 'test.p'), 'wb') as f:
        pickle.dump([test_X, test_y], f)
    with open(os.path.join(experiment_data_path, 'test_unseen.p'), 'wb') as f:
        pickle.dump([test_unseen_X, test_unseen_y], f)
    with open(os.path.join(experiment_data_path, 'wordslist.p'), 'wb') as f:
        pickle.dump(wordslist, f)
    with open('../experiment_parameters.json', 'r') as f:
        experiment_parameters = json.load(f)
    experiment_parameters['input_dims'][experiment_name] = input_dims
    fillers = list(
        set(flatten_arrays(train_instances.values()) +
            flatten_arrays(test_instances.values())))
    experiment_parameters['filler_indices'][experiment_name] = [
        wordslist.index(filler) for filler in fillers
    ]
    experiment_parameters['padding_indices'][experiment_name] = wordslist.index(
        padding_word)
    if 'variablefiller' in experiment_name:
        experiment_parameters['query_to_filler_index'][experiment_name] = {
            wordslist.index(query_starter + role):
            [wordslist.index(filler) for filler in dummy_instances[role]]
            for role in role_types.keys()
        }
        filler_distributions_dict = {
            wordslist.index(dummy_instances[role][0]): distributions_dict[role]
            for role in role_types.keys()
        }
        experiment_parameters['filler_distributions'][experiment_name] = (
            filler_distributions_dict)
    with open('../experiment_parameters.json', 'w') as f:
        json.dump(experiment_parameters, f)
    embedding = []
    for i in range(len(wordslist)):
        word = wordslist[i]
        word_embedding = {}
        word_embedding['index'] = i
        word_embedding['word'] = word
        word_embedding['vector'] = create_word_vector()
        embedding.append(word_embedding)
    with open(os.path.join(experiment_data_path, "embedding.p"), 'wb') as f:
        pickle.dump(embedding, f)
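
# flatten_arrays and construct_all_state_sequences are helpers defined
# elsewhere in this repository. A minimal sketch of the flattening behavior
# the code above relies on (an assumption about the helper, not its actual
# source):
def _flatten_arrays_sketch(arrays):
    """Concatenate a collection of lists into one flat list."""
    return [item for array in arrays for item in array]
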

def generate_batch(X, y, FLAGS, embedding, do_shift_inputs=True,
                   noise_proportion=0.1, zero_vector_noise=False):
    """Generate train batches.

    Constructs batches using one of several possible representations
    (specified by FLAGS.filler_type):
        fixed_filler: Each word vector is specified by the embedding argument.
        variable_filler: Each non-filler word vector is specified by the
            embedding argument. Each filler word (manually specified for each
            experiment) is represented by a new randomly generated vector in
            each story.

    Args:
        X: [num_examples x num_words_per_input x 1] matrix of inputs.
        y: [num_examples x 1] matrix of correct outputs.
        FLAGS: Parameters object of experiment information.
        embedding: [num_words x embedding_dims] matrix of word embeddings.
            NOTE: irrelevant if using a one-hot embedding.

    Yields:
        Batches of batch_size examples, each of which contains:
        X: [batch_size x num_words_per_input x num_dimensions_per_word]
            matrix of inputs.
        y: [batch_size x num_dimensions_per_word] matrix of correct outputs.
        embedding: [num_words_in_corpus x num_dimensions_per_word] matrix of
            vectors representing words in the batch.
    """
    batch_size, filler_type = FLAGS.batch_size, FLAGS.filler_type
    data_size = len(X)
    num_batches = data_size // batch_size
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if filler_type == "fixed_filler":
            if do_shift_inputs:
                yield (embedding[shift_inputs(X[start_index:end_index].squeeze(),
                                              FLAGS.experiment_name)],
                       embedding[y[start_index:end_index].squeeze()],
                       embedding)
            else:
                yield (embedding[X[start_index:end_index]].squeeze(),
                       embedding[y[start_index:end_index].squeeze()],
                       embedding)
        elif "variable_filler" in filler_type:
            # NOTE: Filler indices are manually determined using the word list
            # saved by the experiment creators.
            if "distributions" in filler_type:
                filler_indices_and_distributions = experiment_parameters[
                    "filler_distributions"][FLAGS.experiment_name]
                query_to_filler_indices = experiment_parameters[
                    "query_to_filler_index"][FLAGS.experiment_name]
                filler_indices = list(filler_indices_and_distributions.keys())
                filler_distributions = [
                    filler_indices_and_distributions[filler_index]
                    for filler_index in filler_indices
                ]
                filler_indices = [int(index) for index in filler_indices]
            else:
                filler_indices = experiment_parameters["filler_indices"][
                    FLAGS.experiment_name]
            batchX = X[start_index:end_index].squeeze()
            batchy = y[start_index:end_index].squeeze()
            if FLAGS.function != "analyze" and do_shift_inputs:
                # Don't randomly shift inputs for decoding analysis.
                batchX = shift_inputs(batchX, FLAGS.experiment_name)
            embeddingX, embeddingy = embedding[batchX], embedding[batchy]
            padding_index = experiment_parameters['padding_indices'][
                FLAGS.experiment_name]
            padding_vector = embedding[padding_index]
            epoch_embedding = embedding
            for examplenum in range(batch_size):
                # Create a new random embedding for each filler.
                num_fillers = len(filler_indices)
                # embedding_size is assumed to be defined at module scope: the
                # per-word embedding dimensionality, i.e. embedding.shape[1].
                new_filler_embedding = np.empty((num_fillers, embedding_size))
                if "distributions" in filler_type:
                    for j, filler_distribution in enumerate(filler_distributions):
                        if "variable_filler_distributions_no_subtract" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution="C")
                        elif "variable_filler_distributions_one_distribution" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution=filler_distribution,
                                dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_all_randn_distribution" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution="randn")
                        elif "variable_filler_distributions_A" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution="A",
                                dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_B" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution="B",
                                dominant_distribution_proportion=1)
                        elif "variable_filler_distributions_5050_AB" in filler_type:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution="B",
                                dominant_distribution_proportion=0.5)
                        else:
                            new_filler_embedding[j, :] = embedding_util.create_word_vector(
                                filler_distribution=filler_distribution)
                else:
                    for j in range(num_fillers):
                        new_filler_embedding[j, :] = embedding_util.create_word_vector()
                # Replace the filler embeddings with the new random embeddings.
                filler_ix_X = np.where(np.isin(batchX[examplenum],
                                               filler_indices))
                new_embedding_ix_X = [
                    filler_indices.index(i)
                    for i in batchX[examplenum, filler_ix_X][0]
                ]
                embeddingX[examplenum, filler_ix_X] = new_filler_embedding[
                    new_embedding_ix_X]
                if "noise" in filler_type:
                    if np.random.rand() < noise_proportion:
                        print('noise trial')
                        queried_filler_index = query_to_filler_indices[
                            str(batchX[examplenum, -1])]
                        queried_filler_indices = np.where(
                            batchX[examplenum] == queried_filler_index)
                        if zero_vector_noise:
                            print('zero vector noise')
                            embeddingX[examplenum, queried_filler_indices] = (
                                np.zeros(padding_vector.shape))
                        else:
                            embeddingX[examplenum, queried_filler_indices] = (
                                padding_vector)
                new_embedding_ix_y = [filler_indices.index(batchy[examplenum])]
                embeddingy[examplenum] = new_filler_embedding[new_embedding_ix_y]
                # Append the new filler vectors to the epoch embedding so the
                # response can be identified against it.
                epoch_embedding = np.concatenate(
                    (epoch_embedding, new_filler_embedding), axis=0)
            yield embeddingX, embeddingy, epoch_embedding
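
# Hedged usage sketch: consuming the generator above in a training loop.
# FLAGS is assumed to be any object exposing batch_size, filler_type,
# experiment_name, and function attributes; model and its train_step method
# are hypothetical names used only for illustration.
#
#     for batch_X, batch_y, batch_embedding in generate_batch(
#             train_X, train_y, FLAGS, embedding):
#         loss = model.train_step(batch_X, batch_y)  # hypothetical model API
#
# Each yielded batch_X is [batch_size x num_words_per_input x
# num_dimensions_per_word], and batch_embedding includes any per-story filler
# vectors appended during variable_filler batching.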