def run_RSA(stim_file, vocab_file, model_files, header=False,
            multisent_flag=False, filter_file=None, verbose=False,
            embedding=False):
    '''Given a stimuli file, a model vocabulary file, and model files, return
    information-theoretic measures and similarities for each stimulus.'''

    #hard code data_dir
    data_path = './'

    #set loss function to be cross entropy
    criterion = nn.CrossEntropyLoss()

    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)

    #Loop through the models
    for model_file in model_files:
        if verbose:
            print('testing model:', model_file)

        #load the model
        with open(model_file, 'rb') as f:
            #run on local cpu for now
            model = torch.load(f, map_location='cpu')

            #make a contiguous chunk of memory for speed
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.rnn.flatten_parameters()

        model.eval()

        #loop through experimental items for EXP
        for x in range(len(EXP.UNK_SENTS)):
            sentences = list(EXP.UNK_SENTS[x])
            target = sentences[:1]
            sentences = sentences[1:]

            #Create corpus wrapper (this is for one-hotting data)
            corpus = data_test.TestSent(data_path, vocab_file, target, False)
            #Get one hots
            target_ids = corpus.get_data()

            #Create corpus wrapper (this is for one-hotting data)
            corpus = data_test.TestSent(data_path, vocab_file, sentences,
                                        multisent_flag)
            #Get one hots
            sent_ids = corpus.get_data()

            sims = get_sims(target_ids, sent_ids, corpus, model, embedding)
            values = test_IT(sent_ids, corpus, model)

            EXP.load_IT(model_file, x, values, multisent_flag, sims)

    return EXP
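# A minimal usage sketch for run_RSA (not part of the original pipeline).
# The stimuli, vocab, and checkpoint paths below are hypothetical
# placeholders; substitute the files from your own training setup.
def _example_run_RSA():
    model_files = ['models/lstm_seed0.pt']  #assumed LSTM checkpoint path
    EXP = run_RSA('stimuli/experiment.csv', 'models/vocab.txt', model_files,
                  header=True, verbose=True)
    return EXP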
def run_ELMo_RSA(stim_file, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = WhitespaceTokenizer()

    #Load model
    ##ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    #ELMo Small
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    #ELMo Medium
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    #ELMo OG (5.5B)
    #elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(
        token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens,
                                      {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(
            target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors(
            [target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        #GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP
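# A minimal usage sketch for run_ELMo_RSA. The stimuli path is a hypothetical
# placeholder; the function fetches the ELMo OG weights from the URLs above,
# so no local checkpoint is needed.
def _example_run_ELMo_RSA():
    return run_ELMo_RSA('stimuli/experiment.csv', header=True)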
def check_unk(stim_file, vocab_file, header=False, filter_file=None,
              verbose=False):
    '''Given a stimuli file and a model vocabulary file, return the UNK'd
    stimuli.'''

    #hard code data_dir
    data_path = './'

    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)
    EXP.check_unks()

    return EXP
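# A minimal usage sketch for check_unk: inspect how the stimuli look once
# out-of-vocabulary words are replaced. Paths are hypothetical placeholders;
# UNK_SENTS is the same attribute the run_* functions here iterate over.
def _example_check_unk():
    EXP = check_unk('stimuli/experiment.csv', 'models/vocab.txt', header=True)
    for unk_sents in EXP.UNK_SENTS:
        print(unk_sents)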
def run_BERT_RSA(stim_file, layer, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Load BERT uncased
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    model = BertModel.from_pretrained(pretrained_weights,
                                      output_hidden_states=True)
    #tokenizer = AutoTokenizer.from_pretrained("nyu-mll/roberta-base-100M-3")
    #tokenizer = AutoTokenizer.from_pretrained("nyu-mll/roberta-base-1B-3")
    #model = RobertaForMaskedLM.from_pretrained("nyu-mll/roberta-base-1B-3", output_hidden_states=True)

    model.eval()
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentences = sentences[1:]

        #GET BASELINE
        target_encoded = tokenizer.encode(target)
        target_ids = torch.tensor(target_encoded).unsqueeze(0)

        hidden_states = model(target_ids)[-1]
        #split off the embedding output; keep one tensor per transformer layer
        embed, hidden_states = hidden_states[:1], hidden_states[1:]
        hidden_states = hidden_states[layer][0]

        #index -2 skips the final [SEP] token to reach the last word piece
        baseline_word = tokenizer.decode(
            torch.tensor([target_encoded[-2]])).strip()
        baseline = hidden_states[-2].data.cpu().squeeze()

        #GET SIMS
        sims = get_BERT_sims(sentences[0], layer, baseline, tokenizer, model)
        values = get_dummy_values(sentences[0])

        EXP.load_IT('bert-uncased', x, values, False, sims)

    return EXP
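# A standalone sketch of the hidden-state indexing used in run_BERT_RSA,
# assuming the tuple-style transformers API this file targets. For
# bert-base-uncased, hidden_states holds 13 tensors (the input embeddings
# plus one per layer), each of shape (batch, seq_len, hidden_size); the
# example sentence and layer are arbitrary.
def _example_bert_hidden_state(sentence='the dog barked .', layer=8):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_hidden_states=True)
    model.eval()
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    with torch.no_grad():
        hidden_states = model(input_ids)[-1]
    #drop the embedding output, pick the layer, skip [SEP] with index -2
    return hidden_states[1:][layer][0][-2]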
def run_norming(stim_file, vocab_file, model_files, header=False,
                multisent_flag=False, filter_file=None, verbose=False):
    '''Given a stimuli file, a model vocabulary file, and model files, return
    information about frequency and information-theoretic measures.'''

    #hard code data_dir
    data_path = './'

    #set loss function to be cross entropy
    criterion = nn.CrossEntropyLoss()

    #Load experiments
    EXP = data.Stim(stim_file, header, filter_file, vocab_file)

    #Loop through the models
    for model_file in model_files:
        if verbose:
            print('testing model:', model_file)

        #load the model
        with open(model_file, 'rb') as f:
            #run on local cpu for now
            model = torch.load(f, map_location='cpu')

        #loop through experimental items for EXP
        for x in range(len(EXP.UNK_SENTS)):
            sentences = list(EXP.UNK_SENTS[x])

            #Create corpus wrapper (this is for one-hotting data)
            corpus = data_test.TestSent(data_path, vocab_file, sentences,
                                        multisent_flag)
            #Get one hots
            sent_ids = corpus.get_data()

            values = test_IT(sent_ids, corpus, model)

            EXP.load_IT(model_file, x, values, multisent_flag)

    return EXP
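# A minimal usage sketch for run_norming, with multisent_flag=True
# (presumably treating each item's sentences as one continuous context).
# Paths are hypothetical placeholders.
def _example_run_norming():
    model_files = ['models/lstm_seed0.pt']  #assumed LSTM checkpoint path
    return run_norming('stimuli/experiment.csv', 'models/vocab.txt',
                       model_files, multisent_flag=True)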
def run_GPT_RSA(stim_file, layer, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
    #Load model
    model = GPT2LMHeadModel.from_pretrained(
        'gpt2-xl', output_hidden_states=True)  #, force_download=True)
    #turn off learning
    model.eval()
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        target_encoded = tokenizer.encode(target, add_special_tokens=True,
                                          add_prefix_space=True)
        target_input_ids = torch.tensor(target_encoded).unsqueeze(0)

        #Get model outputs
        output = model(target_input_ids)
        predictions, mems, hidden_states = output

        #drop the embedding output, then take the final token's vector
        hidden_states = hidden_states[1:]
        baseline = hidden_states[layer][0][-1].data.cpu().squeeze()

        #GET SIMs
        sims = get_GPT_sims(sentence, layer, baseline, tokenizer, model)
        values = get_dummy_values(sentence)

        EXP.load_IT('gpt2', x, values, False, sims)

    return EXP
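# A standalone sketch of the output unpacking used in run_GPT_RSA, assuming
# the older tuple-returning transformers API this file targets. For gpt2-xl,
# hidden_states holds the embedding output plus one tensor per layer, each
# of shape (batch, seq_len, hidden_size); the sentence and layer are
# arbitrary examples.
def _example_gpt_hidden_state(sentence='the dog barked', layer=24):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
    model = GPT2LMHeadModel.from_pretrained('gpt2-xl',
                                            output_hidden_states=True)
    model.eval()
    ids = tokenizer.encode(sentence, add_special_tokens=True,
                           add_prefix_space=True)
    with torch.no_grad():
        logits, past, hidden_states = model(torch.tensor(ids).unsqueeze(0))
    #skip the embedding output, then take the final token's vector at `layer`
    return hidden_states[1:][layer][0][-1]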