def __init__(self): """Initalizes object """ self.__encoder = GenSenSingle( model_folder=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models'), filename_prefix='nli_large', pretrained_emb=os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'embedding', 'glove.840B.300d.h5') ) with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data', 'models', 'senteval.pickle'), 'rb') as file: self.__evaluator = pickle.load(file) self.__mutex = Lock()
import json

import h5py
import numpy as np

# NUM_EMBEDDINGS is defined elsewhere in the original module.


def get_gensen_synset_definitions(entity_file, vocab_file, gensen_file):
    from gensen import GenSen, GenSenSingle

    gensen_1 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5')
    gensen_1.eval()

    # Collect synset definitions keyed by node id.
    definitions = {}
    with open(entity_file, 'r') as fin:
        for line in fin:
            node = json.loads(line)
            if node['type'] == 'synset':
                definitions[node['id']] = node['definition']

    with open(vocab_file, 'r') as fin:
        vocab_list = fin.read().strip().split('\n')

    # Get the descriptions. Note the + 1 offset: slot 0 is left empty.
    sentences = [''] * NUM_EMBEDDINGS
    for k, entity in enumerate(vocab_list):
        definition = definitions.get(entity)
        if definition is None:
            assert entity in ('@@UNKNOWN@@', '@@MASK@@', '@@NULL@@')
        else:
            sentences[k + 1] = definition

    # Encode the definitions in batches of 32.
    embeddings = np.zeros((NUM_EMBEDDINGS, 2048), dtype=np.float32)
    for k in range(0, NUM_EMBEDDINGS, 32):
        sents = sentences[k:(k + 32)]
        reps_h, reps_h_t = gensen_1.get_representation(
            sents, pool='last', return_numpy=True, tokenize=True)
        embeddings[k:(k + 32), :] = reps_h_t
        print(k)

    with h5py.File(gensen_file, 'w') as fout:
        fout.create_dataset('gensen', data=embeddings)
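# A quick read-back sketch for the file produced above. Only the dataset name
# 'gensen' comes from get_gensen_synset_definitions; the file path here is a
# hypothetical example.
import h5py

with h5py.File('synset_gensen.h5', 'r') as fin:
    gensen_embeddings = fin['gensen'][:]  # (NUM_EMBEDDINGS, 2048) float32
    print(gensen_embeddings.shape, gensen_embeddings.dtype)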
def __init__(self, train_loader, validation_loader, test_loader, device):
    self.train_loader = train_loader
    self.validation_loader = validation_loader
    self.test_loader = test_loader
    self.clip = 5
    self.latent_size = 2048
    self.decoder_hidden_size = 2048
    self.decoder_layers = 2
    self.noise = Normal(torch.tensor([0.0], requires_grad=False),
                        torch.tensor([0.12], requires_grad=False))
    self.encoder = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip',
        pretrained_emb='./data/embedding/glove.840B.300d.h5',
        cuda=torch.cuda.is_available())
    vocab_size = len(self.encoder.encoder.src_embedding.weight)
    self.encoder.encoder.to(device)
    self.decoder = Decoder(self.decoder_hidden_size, self.latent_size,
                           vocab_size, self.decoder_layers,
                           device=device, clip=5).to(device)
    # Mask out <pad> in the reconstruction loss. Moved onto `device` instead
    # of the original hard-coded .cuda() so the code also runs on CPU.
    weight_mask = torch.ones(vocab_size).to(device)
    weight_mask[self.encoder.word2id['<pad>']] = 0
    self.criterion = nn.CrossEntropyLoss(weight=weight_mask).to(device)
    self.bce = nn.BCELoss()
    self.device = device
    self.embedding_norms = torch.norm(
        self.encoder.encoder.src_embedding.weight, 1)
    print(self.decoder)
# W2V_PATH = '/Users/karanjani/Desktop/InferSent-master/dataset/fastText/crawl-300d-2M.vec'  # ENTER PATH TO FASTTEXT
# infersent.set_w2v_path(W2V_PATH)
# infersent.build_vocab(cleanedStrings, tokenize=True)
# embeddings = infersent.encode(cleanedStrings, tokenize=True)
# fbvecFrame = pd.DataFrame(list(embeddings))      # converting Facebook embeddings tuple to dataframe
# FBcols = ["FB%d" % d for d in range(4096)]       # creating list of column names for Facebook vectors
# fbvecFrame.columns = FBcols                      # reset column names to FB0, FB1 ... FB4095
# fullFrame = pd.concat([df, fbvecFrame], axis=1)  # creating new dataframe with Facebook vectors

################################
######  GENSEN EMBEDDINGS ######
################################

gensen_1 = GenSenSingle(
    model_folder='/Users/karanjani/Desktop/gensen/data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='/Users/karanjani/Desktop/gensen/data/embedding/glove.840B.300d.h5')
gensen_2 = GenSenSingle(
    model_folder='/Users/karanjani/Desktop/gensen/data/models',
    filename_prefix='nli_large_bothskip_parse',
    pretrained_emb='/Users/karanjani/Desktop/gensen/data/embedding/glove.840B.300d.h5')

# reps_h, reps_h_t = gensen_1.get_representation(messageString, pool='last', return_numpy=True, tokenize=True)
gensen = GenSen(gensen_1, gensen_2)
reps_h, reps_h_t = gensen.get_representation(
    cleanedStrings, pool='last', return_numpy=True, tokenize=True)

gsvecFrame = pd.DataFrame(reps_h_t)
GScols = ["GS%d" % d for d in range(4096)]  # the combined model yields 4096-d vectors (2 x 2048)
gsvecFrame.columns = GScols
# fullFrame = pd.concat([fullFrame, gsvecFrame], axis=1)
fullFrame = pd.concat([df, gsvecFrame], axis=1)

################################
######  GOOGLE EMBEDDINGS ######
################################
class EncodingIteratorBase(DataIterator):
    """Base generator class of sentence encodings."""

    def __init__(self, max_sent_length, max_sent_src, max_sent_trg,
                 data_folder, model_folder, pretrain_path, prefix,
                 source_file, target_file, use_gensen_w2i,
                 device_ids=[0], data_parallelize=False, test=False):
        """
        :param max_sent_length: max words per sentence;
            gensen_h --> batch size x max_len x rep_size
        :param max_sent_src: max number of sentences in a source doc
        :param max_sent_trg: max number of sentences in a target doc
        :param data_folder: data location
        :param model_folder: location of the pretrained gensen
        :param pretrain_path: location of pretrained embeddings (e.g. GloVe)
        :param prefix: filename prefix identifying the gensen variant,
            e.g. "nli_large" + "bothskip" + "arxiv"
        :param source_file: name of source file in data_folder
        :param target_file: name of target file in data_folder
        :param use_gensen_w2i: if True, reuse the pretrained gensen word-to-id
            mapping instead of building a new vocabulary
        :param device_ids: used when data_parallelize is True; devices to use
        :param data_parallelize: wrap gensen in data parallelism
        :param test: if True, read only a small number of documents
        """
        self.max_len = max_sent_length        # max words
        self.max_sent_src = max_sent_src      # max sentences src
        self.max_sent_trg = max_sent_trg      # max sentences trg
        self.data_folder = data_folder
        self.source_file = source_file
        self.target_file = target_file
        self.src_data = []
        self.atrg_data = []
        self.data_parallelize = data_parallelize
        self.device_ids = device_ids
        self.test = test
        logging.debug("""
            max_len: {}, max_sent_src: {}, max_sent_trg: {},
            data_folder: {}, source_file: {}, target_file: {}
            """.format(self.max_len, self.max_sent_src, self.max_sent_trg,
                       self.data_folder, self.source_file, self.target_file))
        self.gensen = GenSenSingle(model_folder=model_folder,
                                   filename_prefix=prefix,
                                   pretrained_emb=pretrain_path,
                                   cuda=True,
                                   max_sentence_length=max_sent_length,
                                   data_parallelize=data_parallelize,
                                   device_ids=device_ids[::-1])
        self.sen_rep_dim = self.gensen.sen_rep_dim
        self.vocab_size = self.gensen.vocab_size
        self.emb_dim = self.gensen.embedding_dim
        self.vocab_expansion(use_gensen_w2i)

    def vocab_expansion(self, use_gensen_w2i):
        """Read data from files and expand the gensen vocabulary."""
        if self.test:
            logging.debug(" Testing with 100 documents")
        files = [self.source_file, self.target_file]
        data = [self.src_data, self.atrg_data]
        maxes_sen = [self.max_sent_src, self.max_sent_trg]
        for file, dt, max_sen in zip(files, data, maxes_sen):
            with open('%s/%s' % (self.data_folder, file), 'r',
                      encoding="utf-8") as source:
                doc = []
                for sentence in source:
                    # A blank line marks a document boundary.
                    if doc and sentence.startswith("\n"):
                        if len(doc) > max_sen:
                            doc = doc[0:max_sen]
                        dt.append(doc)
                        doc = []
                    elif sentence.strip():
                        doc.append(sentence.strip())
                    if self.test and len(dt) > test_num_docs:
                        break
        self.num_docs = len(self.src_data)
        assert self.num_docs == len(self.atrg_data)
        logging.info(" Constructing vocabulary...")
        if use_gensen_w2i:
            # If True, do not construct a new vocab.
            self.word2id = self.gensen.word2id
            self.id2word = self.gensen.id2word
        else:
            self.word2id, self.id2word = self.construct_vocab(
                list(chain.from_iterable(self.src_data)) +
                list(chain.from_iterable(self.atrg_data)),
                self.vocab_size)
        self.gensen.vocab_expansion(self.word2id.keys())
        self.vocab_size = self.gensen.vocab_size
        logging.info(" Data has been read")
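# Hedged usage sketch for EncodingIteratorBase. Every path and file name below
# is a placeholder, not a value from the original project.
iterator = EncodingIteratorBase(
    max_sent_length=100,                                  # max words per sentence
    max_sent_src=50,                                      # max sentences per source doc
    max_sent_trg=10,                                      # max sentences per target doc
    data_folder='./data',
    model_folder='./data/models',
    pretrain_path='./data/embedding/glove.840B.300d.h5',
    prefix='nli_large_bothskip',
    source_file='train.src',
    target_file='train.trg',
    use_gensen_w2i=True,                                  # reuse GenSen's word2id
)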
    'I am pleased to inform all of those that believe in a strong, fair and sound Immigration Policy that Mark Morgan will be joining the Trump Administration as the head of our hard working men and women of ICE. Mark is a true believer and American Patriot. He will do a great job!',
    'For too long, a small group in our nations Capital has reaped the rewards of government while the people have borne the cost. Washington flourished -- but the people did not share in its wealth. Politicians prospered -- but the jobs left, and the factories closed.'
]

obama = [
    'Condolences to the family of John Singleton. His seminal work, Boyz n the Hood, remains one of the most searing, loving portrayals of the challenges facing inner-city youth. He opened doors for filmmakers of color to tell powerful stories that have been too often ignored.',
    'This generation of climate activists is tired of inaction, and theyve caught the attention of leaders all over the world. So while this challenge is only getting more urgent, they show us the kind of action itll take to meet this moment.',
    'That we are in the midst of crisis is now well understood. Our nation is at war, against a far-reaching network of violence and hatred. Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some, but also our collective failure to make hard choices and prepare the nation for a new age. Homes have been lost; jobs shed; businesses shuttered. Our health care is too costly; our schools fail too many; and each day brings further evidence that the ways we use energy strengthen our adversaries and threaten our planet.'
]

idx2speaker = [
    'trump1', 'trump2', 'trumpinaguration',
    'obama1', 'obama2', 'obamainaguration',
    'shakespeare1', 'shakespeare2',
    'wutang1', 'wutang2',
    'lukecombs', 'lukecombs'
]

# Concatenation order must match idx2speaker: wutang before lukecombs
# (the original had the two swapped, mislabeling those points).
sentences = trump + obama + shakespeare + wutang + lukecombs

gensen_1 = GenSenSingle(model_folder='./data/models',
                        filename_prefix='nli_large_bothskip',
                        pretrained_emb='./data/embedding/glove.840B.300d.h5')
reps_h, reps_h_t = gensen_1.get_representation(
    sentences, pool='last', return_numpy=True, tokenize=True)

# Mean-pool the per-token representations for each sentence.
x = []
for i in range(len(reps_h)):
    x.append(reps_h[i].mean(axis=0))

model = TSNE(n_components=2, perplexity=20, init='pca', method='exact',
             n_iter=5000)
x = model.fit_transform(x)
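# A small follow-up sketch (matplotlib is an assumption, not part of the
# original script) that plots the 2-D t-SNE coordinates computed above and
# labels each point with its speaker from idx2speaker.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(x[:, 0], x[:, 1])
for i, speaker in enumerate(idx2speaker):
    plt.annotate(speaker, (x[i, 0], x[i, 1]))
plt.title('t-SNE of mean GenSen sentence representations')
plt.savefig('tsne_speakers.png')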
    s2vsingle[i].set_w2v_path(PATH_TO_W2V)
    s2vsingle[i] = s2vsingle[i].cuda()

sent2vec = Sent2Vec(s2vsingle, 'concat')

params_model = {'bsize': 64, 'pool_type': 'mean', 'which_layer': 'all',
                'optfile': ELMO_OPTIONS, 'wgtfile': ELMO_WEIGHT}
elmo = ELMo(params_model)
elmo = elmo.cuda()

gensen_1 = GenSenSingle(
    model_folder=FOLDER_PATH,
    filename_prefix=PREFIX1,
    pretrained_emb=PRETRAIN_EMB,
    cuda=True
)
gensen_2 = GenSenSingle(
    model_folder=FOLDER_PATH,
    filename_prefix=PREFIX2,
    pretrained_emb=PRETRAIN_EMB,
    cuda=True
)
gensen = GenSen(gensen_1, gensen_2)

models = {
    'sent2vec': sent2vec,
    'elmo': elmo,
    'gensen': gensen
}
# NOTE: To decide the pooling strategy for a new model, note down the
# validation set scores below.
)
parser.add_argument("--cuda",
                    help="Use GPU to compute sentence representations",
                    default=torch.cuda.is_available())
args = parser.parse_args()

print('#############################')
print('####### Parameters ##########')
print('Prefix 1 : %s ' % (args.prefix_1))
print('Prefix 2 : %s ' % (args.prefix_2))
print('Pretrained Embeddings : %s ' % (args.pretrain))
print('#############################')

gensen_1 = GenSenSingle(model_folder=args.folder_path,
                        filename_prefix=args.prefix_1,
                        pretrained_emb=args.pretrain,
                        cuda=args.cuda)
gensen_2 = GenSenSingle(model_folder=args.folder_path,
                        filename_prefix=args.prefix_2,
                        pretrained_emb=args.pretrain,
                        cuda=args.cuda)
gensen = GenSen(gensen_1, gensen_2)
params_senteval['gensen'] = gensen

se = senteval.engine.SE(params_senteval, batcher, prepare)
results_transfer = se.eval(transfer_tasks)

print('--------------------------------------------')
print('MR [Dev:%.1f/Test:%.1f]' % (results_transfer['MR']['devacc'],
                                   results_transfer['MR']['acc']))
print('CR [Dev:%.1f/Test:%.1f]' %
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--gpu_id", help="gpu id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=0)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    parser.add_argument("--log_file", help="File to log to", type=str, default='')
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)

    # Model options
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=16)
    parser.add_argument("--model_dir", help="path to model folder")
    parser.add_argument("--prefix1", help="prefix to model 1", default='nli_large_bothskip_parse')
    parser.add_argument("--prefix2", help="prefix to model 2", default='nli_large_bothskip')
    parser.add_argument("--word_vec_file", help="path to pretrained vectors")
    parser.add_argument("--strategy",
                        help="Approach to create sentence embedding last/max/best",
                        choices=["best", "max", "last"], default="best")

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use for the classifier",
                        type=int, default=16)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    log_file = os.path.join(args.out_dir, "results.log")
    fileHandler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(fileHandler)
    logging.info(args)
    torch.cuda.set_device(args.gpu_id)

    # Set up SentEval
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch,
                       'kfold': 10, 'max_seq_len': args.max_seq_len,
                       'batch_size': args.batch_size, 'load_data': args.load_data,
                       'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam',
                                     'batch_size': args.cls_batch_size,
                                     'tenacity': 5, 'epoch_size': 4,
                                     'cudaEfficient': True}

    # Load model: import the GenSen package from the model directory.
    sys.path.insert(0, args.model_dir)
    from gensen import GenSen, GenSenSingle
    ckpt_dir = os.path.join(args.model_dir, "data", "models")
    gensen_1 = GenSenSingle(model_folder=ckpt_dir,
                            filename_prefix=args.prefix1,
                            pretrained_emb=args.word_vec_file,
                            cuda=bool(args.gpu_id >= 0))
    gensen_2 = GenSenSingle(model_folder=ckpt_dir,
                            filename_prefix=args.prefix2,
                            pretrained_emb=args.word_vec_file,
                            cuda=bool(args.gpu_id >= 0))
    gensen = GenSen(gensen_1, gensen_2)
    global STRATEGY
    STRATEGY = args.strategy
    params_senteval['gensen'] = gensen

    # Do SentEval stuff
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    tasks = get_tasks(args.tasks)
    results = se.eval(tasks)
    write_results(results, args.out_dir)
    logging.info(results)
class Seq2SeqGAN:

    def __init__(self, dataset, device):
        self.dataset = dataset
        self.clip = 5
        self.latent_size = 2048
        self.decoder_hidden_size = 2048
        self.decoder_layers = 2
        self.noise = Normal(torch.tensor([0.0], requires_grad=False),
                            torch.tensor([0.12], requires_grad=False))
        self.encoder = GenSenSingle(
            model_folder='./data/models',
            filename_prefix='nli_large_bothskip',
            pretrained_emb='./data/embedding/glove.840B.300d.h5',
            cuda=torch.cuda.is_available())
        vocab_size = len(self.encoder.encoder.src_embedding.weight)
        self.encoder.encoder.to(device)
        self.decoder = Decoder(self.decoder_hidden_size, self.latent_size,
                               vocab_size, self.decoder_layers,
                               device=device, clip=5).to(device)
        # Mask out <pad> in the reconstruction loss. Moved onto `device`
        # instead of the original hard-coded .cuda() so the code also runs
        # on CPU.
        weight_mask = torch.ones(vocab_size).to(device)
        weight_mask[self.encoder.word2id['<pad>']] = 0
        self.criterion = nn.CrossEntropyLoss(weight=weight_mask).to(device)
        self.bce = nn.BCELoss()
        self.device = device
        self.embedding_norms = torch.norm(
            self.encoder.encoder.src_embedding.weight, 1)
        print(self.decoder)

    def print_step(self, input_tensor, lengths, decoder_outputs, losses, epoch, i):
        # print out a medium sized tweet
        mid = int(BATCH_SIZE / 2)
        input_to_print = input_tensor[mid, :lengths[mid]].view(-1)
        output_to_print = decoder_outputs[mid, :lengths[mid], :]
        input_text = ' '.join(
            [self.encoder.id2word[int(i)] for i in input_to_print])
        output_text = self.unembed(output_to_print)
        print('{0:d} {1:d} l1: {2:.10f} l2: {3:.10f}'.format(
            epoch, i * BATCH_SIZE, losses[0], losses[1]))
        print(input_text)
        print(output_text)
        print(' ', flush=True)

    def get_loss(self, cropped_input, lengths, decoder_outputs, stops):
        l1 = self.criterion(
            decoder_outputs.contiguous().view(-1, decoder_outputs.size(2)),
            cropped_input.contiguous().view(-1))
        # The stop head should fire exactly at the last token of each sequence.
        ideal_stops = torch.zeros_like(stops)
        for i, l in enumerate(lengths):
            if l <= ideal_stops.size(1):
                ideal_stops[i, l - 1] = 1
        l2 = self.bce(stops, ideal_stops)
        return l1, l2

    def train_step(self, input_tensor, lengths, optimizer_gen, word_dropout_rate):
        optimizer_gen.zero_grad()
        encoder_outputs, encoder_hidden, embedded_input, lengths = \
            self.encoder.get_representation_and_embedded_input(
                input_tensor, pool='last', return_numpy=False, tokenize=True)
        # The encoder is frozen: detach everything that comes out of it.
        encoder_outputs = encoder_outputs.detach()
        encoder_hidden = encoder_hidden.detach()
        embedded_input = embedded_input.detach()
        lengths = lengths.to(self.device)  # was the global `device` in the original
        lengths = lengths.detach()
        # Perturb the sentence representation with Gaussian noise.
        noise = self.noise.sample(
            encoder_hidden.size()).view_as(encoder_hidden).to(self.device)
        encoder_hidden += noise
        decoder_outputs, stops = self.decoder.forward(encoder_outputs,
                                                      encoder_hidden,
                                                      word_dropout_rate)
        # resize input to match decoder output (due to pre-empting decoder)
        cropped_input = embedded_input[:, :decoder_outputs.size(1)]
        l1, l2 = self.get_loss(cropped_input, lengths, decoder_outputs, stops)
        loss_gen = l1 + l2
        loss_gen.backward()
        optimizer_gen.step()
        losses = np.array([l1.item(), l2.item()])
        return losses, decoder_outputs.data, embedded_input.data, lengths.data

    def validation_step(self, input_tensor, lengths):
        encoder_outputs, encoder_hidden, embedded_input, lengths = \
            self.encoder.get_representation_and_embedded_input(
                input_tensor, pool='last', return_numpy=False, tokenize=True)
        encoder_outputs = encoder_outputs.detach()
        encoder_hidden = encoder_hidden.detach()
        embedded_input = embedded_input.detach()
        lengths = lengths.to(self.device)
        lengths = lengths.detach()
        decoder_outputs, stops = self.decoder.forward(encoder_outputs,
                                                      encoder_hidden, 0)
        # resize input to match decoder output (due to pre-empting decoder)
        cropped_input = embedded_input[:, :decoder_outputs.size(1)]
        l1, l2 = \
            self.get_loss(cropped_input, lengths, decoder_outputs, stops)
        losses = np.array([l1.item(), l2.item()])
        return losses, decoder_outputs.data, embedded_input.data, lengths.data

    def train(self, optimizer_gen, epochs=1, print_every=500,
              validate_every=50000, word_dropout_rate=0.0,
              best_validation_loss=np.inf, start_at=0):
        print('USING: {}'.format(self.device))
        validations_since_best = 0
        for epoch in range(epochs):
            print_total = np.array([0.0] * 2)
            for i in range(1500000):
                input_tensor, lengths = self.dataset[0]
                lengths = lengths.to(self.device)
                if len(input_tensor) != BATCH_SIZE:
                    break
                samples_processed = (epoch * BATCH_SIZE * 1500000) + (
                    (i + 1) * BATCH_SIZE) + start_at
                losses, decoder_outputs, embedded_input, lengths = self.train_step(
                    input_tensor, lengths, optimizer_gen, word_dropout_rate)
                print_total += losses
                if i > 0 and i % print_every == 0:
                    print_total /= print_every
                    self.print_step(embedded_input, lengths, decoder_outputs,
                                    print_total, epoch, i)
                    for y, l in zip(print_total, ['reconstruction', 'stops']):
                        vis.line(X=np.array([int(samples_processed)]),
                                 Y=np.array([[float(y)]]),
                                 win=l,
                                 opts=dict(title=l,
                                           xlabel='samples processed',
                                           ylabel='loss',
                                           legend=['train']),
                                 update='append')
                    print_total *= 0
                if i > 0 and i % validate_every == 0:
                    val = self.validate(print_every, samples_processed)
                    vis.line(X=np.array([int(samples_processed)]),
                             Y=np.array([[float(val / len(validation_loader))]]),
                             win='validation',
                             opts=dict(title="validation",
                                       xlabel='samples processed',
                                       ylabel='loss',
                                       legend=['val']),
                             update='append')
                    if val < best_validation_loss:
                        best_validation_loss = val
                        validations_since_best = 0
                        save_checkpoint(self.decoder, optimizer_gen,
                                        samples_processed, best_validation_loss)
                    else:
                        validations_since_best += 1
                        print("{} SINCE LAST BEST VALIDATION".format(
                            validations_since_best))
                        if validations_since_best >= 100:
                            return
                del input_tensor
                del decoder_outputs
                del embedded_input

    def validate(self, print_every, samples_processed):
        print("VALIDATING")
        print_total = np.array([0.0] * 2)
        with torch.no_grad():
            for i in range(5000):
                input_tensor, lengths = self.dataset[0]
                # for (i, [input_tensor, lengths]) in enumerate(validation_loader):
                if len(input_tensor) != BATCH_SIZE:
                    break
                lengths = lengths.to(self.device)
                losses, decoder_outputs, embedded_input, lengths = self.validation_step(
                    input_tensor, lengths)
                print_total += losses
                if i > 0 and i % print_every == 0:
                    self.print_step(embedded_input, lengths, decoder_outputs,
                                    print_total / i, 0, i)
                del input_tensor
                del decoder_outputs
                del embedded_input
        print("AVERAGE VALIDATION LOSS: {}".format(
            float(print_total[0]) / len(validation_loader)))
        return float(print_total[0])

    # def unembed(self, decoder_outputs, length=MAX_LENGTH):
    #     indices = [int(torch.argmax(
    #         torch.mm(self.encoder.encoder.src_embedding.weight,
    #                  torch.unsqueeze(d, 1)[:EMBEDDING_SIZE])
    #         / self.embedding_norms
    #     )) for d in decoder_outputs]
    #     return ' '.join([self.encoder.id2word[i] for i in indices])

    def unembed(self, decoder_outputs, length=MAX_LENGTH):
        # Greedy decoding: take the argmax over the vocabulary at each step.
        indices = [int(torch.argmax(d)) for d in decoder_outputs]
        # indices = [int(torch.argmin(
        #     torch.norm(self.encoder.encoder.src_embedding.weight - d[:EMBEDDING_SIZE], dim=1)
        # )) for d in decoder_outputs]
        return ' '.join([self.encoder.id2word[i] for i in indices])
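# Hedged driver sketch for Seq2SeqGAN. `dataset` and BATCH_SIZE are assumed to
# be defined as in the class above; the optimizer and its hyperparameters are
# illustrative assumptions, not values from the original project.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gan = Seq2SeqGAN(dataset, device)
optimizer_gen = torch.optim.Adam(gan.decoder.parameters(), lr=1e-4)
gan.train(optimizer_gen, epochs=3, word_dropout_rate=0.1)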
        if i % 10 == 0:
            print("%d sentences done" % (i))

    # print("Adv embeddings shape: %s, adv_labels shape", len(sent_adv_embeddings), dim(adv_labels[i]))
    print("Adv embeddings shape: %s, adv_labels shape %s" %
          (len(adv_embeddings), len(adv_labels)))
    for i in range(0, len(adv_embeddings), 10):
        # Fixed: the original passed the format string and its arguments as
        # separate print() arguments instead of %-formatting them.
        print("Adv embeddings shape: %s, adv_labels shape %s" %
              (len(adv_embeddings[i]), len(adv_labels[i])))
    return adv_embeddings, adv_labels, adv_batch_sentences


# Load GenSen model
gensen_1 = GenSenSingle(model_folder='../data/models',
                        filename_prefix='nli_large_bothskip',
                        pretrained_emb='fasttext/glove.840B.300d.h5')
gensen_2 = GenSenSingle(model_folder='../data/models',
                        filename_prefix='nli_large_bothskip_parse',
                        pretrained_emb='fasttext/glove.840B.300d.h5')
gensen_encoder = GenSen(gensen_1, gensen_2)
# reps_h, reps_h_t = gensen_encoder.get_representation(
#     sentences, pool='last', return_numpy=True, tokenize=True
# )

# Set params for SentEval
params_senteval = {
    'task_path': PATH_TO_DATA,
    'usepytorch': True,
    'kfold': 5,
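# Hedged sketch of the SentEval batcher this setup typically pairs with. The
# signature follows the SentEval convention; the body is an assumption and is
# not shown in this excerpt.
def batcher(params, batch):
    # SentEval passes each sentence as a list of tokens.
    sentences = [' '.join(s) for s in batch]
    _, reps_h_t = gensen_encoder.get_representation(
        sentences, pool='last', return_numpy=True, tokenize=True)
    return reps_h_t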
class SemanticAnalyser(object):
    """Class for comparing sentences for entailment."""

    def __init__(self):
        """Initializes the object."""
        self.__encoder = GenSenSingle(
            model_folder=os.path.join(os.path.dirname(__file__),
                                      'GenSen', 'data', 'models'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(os.path.dirname(__file__),
                                        'GenSen', 'data', 'embedding',
                                        'glove.840B.300d.h5')
        )
        with open(os.path.join(os.path.dirname(__file__), 'GenSen', 'data',
                               'models', 'senteval.pickle'), 'rb') as file:
            self.__evaluator = pickle.load(file)
        self.__mutex = Lock()

    def get_entailments_with_levels(self, sentence, sentences):
        """Analyzes the relation between a sentence and all sentences in a collection.

        Args:
            sentence: a sentence
            sentences: a non-empty list of sentences
        Returns:
            entailment: 0 if entailed, 1 if neutral, 2 if contradicting,
                for each element in sentences
            level: a non-negative value of how strongly this sentence is
                entailed by each element in sentences
        """
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence] + sentences, pool='last', return_numpy=True,
            tokenize=True)
        # Classifier features: [u, v, u * v] for each pair.
        input = np.concatenate((
            np.repeat([encoded[0]], len(sentences), axis=0),
            encoded[1:],
            (np.repeat([encoded[0]], len(sentences), axis=0)) * encoded[1:]),
            axis=1)
        output = self.__model_predict(input)
        self.__mutex.release()
        entailment = np.argmax(output, axis=1)
        level = np.max(output, axis=1) - np.transpose(output)[1]
        # Identical sentences trivially entail each other.
        for i, sent in enumerate(sentences):
            if sentence == sent:
                entailment[i] = 0
                level[i] = 1e10
        return entailment, level

    def get_entailment(self, sentence1, sentence2):
        """Analyzes the relation between two sentences.

        Args:
            sentence1: first sentence as a string
            sentence2: second sentence as a string
        Returns:
            0 if entailed, 1 if neutral, 2 if contradicting
        """
        if sentence1 == sentence2:
            return 0
        self.__mutex.acquire()
        _, encoded = self.__encoder.get_representation(
            [sentence1, sentence2], pool='last', return_numpy=True,
            tokenize=True)
        input = np.concatenate((encoded[0], encoded[1],
                                encoded[0] * encoded[1]))
        output = self.__model_predict(np.array([input]))
        self.__mutex.release()
        return np.argmax(output)

    def __model_predict(self, input):
        sentence_size = input.shape[1] // 3
        batch_size = input.shape[0]
        # Average predictions over both sentence orders.
        switched_input = np.hstack((input[:, sentence_size:2*sentence_size],
                                    input[:, 0:sentence_size],
                                    input[:, 2*sentence_size:3*sentence_size]))
        input = np.vstack((input, switched_input))
        self.__evaluator.model.eval()
        input = torch.FloatTensor(input).cuda()
        yhat = []
        with torch.no_grad():
            for i in range(0, len(input), self.__evaluator.batch_size):
                x = input[i:i + self.__evaluator.batch_size]
                output = self.__evaluator.model(x)
                yhat.append(output.data.cpu().numpy())
        yhat = np.vstack(yhat)
        yhat = (yhat[0:batch_size, :] + yhat[batch_size:2*batch_size, :]) / 2
        return yhat
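# Hedged usage sketch for SemanticAnalyser; the example sentences are
# illustrative only.
analyser = SemanticAnalyser()
print(analyser.get_entailment('A man is sleeping.', 'A person is asleep.'))
# -> 0 (entailed), 1 (neutral) or 2 (contradicting)
labels, levels = analyser.get_entailments_with_levels(
    'A man is sleeping.',
    ['A person is asleep.', 'A dog is barking.'])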
    reps_h, reps_h_t = gensen_1.get_representation(
        list_mystr, pool='last', return_numpy=True, tokenize=True)
    vectors = reps_h_t.tolist()
    return vectors


@app.route('/get_embeddings/', methods=['POST'])
def home():
    sentences_list = list(request.json['sentences_list'])
    sentences_list = [
        x.lower().encode("unicode_escape").decode("utf8")
        for x in sentences_list
    ]
    if not sentences_list:
        return "Arg \"sentences_list\" not found"
    vec = embeddings(sentences_list)
    # print(type(vec), len(vec))
    return jsonify(vectors=vec)


gensen_1 = GenSenSingle(
    model_folder='gensen/data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='gensen/data/embedding/glove.840B.300d.h5')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7654)
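# A minimal client sketch for the service above. The `requests` dependency and
# the localhost URL are assumptions; the route and JSON field names match the
# Flask code.
import requests

resp = requests.post(
    'http://localhost:7654/get_embeddings/',
    json={'sentences_list': ['hello world', 'gensen sentence embeddings']})
vectors = resp.json()['vectors']  # one 2048-d list per input sentence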
class Recommender(nn.Module):

    def __init__(self, train_vocab, n_movies, params):
        super(Recommender, self).__init__()
        self.params = params
        self.train_vocab = train_vocab
        self.n_movies = n_movies
        self.cuda_available = torch.cuda.is_available()

        # Instantiate the gensen module that will be used in the encoder HRNN
        # and by the recommender module.
        self.gensen = GenSenSingle(
            model_folder=os.path.join(config.MODELS_PATH, 'GenSen'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(config.MODELS_PATH,
                                        'embeddings/glove.840B.300d.h5'),
            cuda=self.cuda_available)
        self.gensen.vocab_expansion(list(train_vocab))

        # HRNN encoder. The conversation encoder is not bidirectional.
        self.encoder = HRNN(params=params['hrnn_params'],
                            train_vocabulary=train_vocab,
                            gensen=self.gensen,
                            train_gensen=False,
                            conv_bidirectional=False)
        self.recommender_module = RecommendFromDialogue(
            params=params['recommend_from_dialogue_params'],
            train_vocab=train_vocab,
            n_movies=n_movies,
            gensen=self.gensen,
        )
        if params['language_aware_recommender']:
            self.language_to_user = nn.Linear(
                in_features=params['hrnn_params']['conversation_encoder_hidden_size'],
                out_features=self.recommender_module.autorec.user_representation_size)

        # latent variable distribution parameters:
        latent_layer_sizes = params['latent_layer_sizes']
        if latent_layer_sizes is not None:
            latent_variable_size = latent_layer_sizes[-1]
            self.prior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=params['hrnn_params']['conversation_encoder_hidden_size'],
                          out_features=latent_layer_sizes[0]) if i == 0
                else nn.Linear(in_features=latent_layer_sizes[i - 1],
                               out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = params['hrnn_params']['conversation_encoder_hidden_size'] \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_prior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_prior = nn.Linear(penultimate_size, latent_variable_size)

            # context size + size of sentence representations
            posterior_input_size = params['hrnn_params']['conversation_encoder_hidden_size'] + \
                2 * params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            self.posterior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=posterior_input_size,
                          out_features=latent_layer_sizes[0]) if i == 0
                else nn.Linear(in_features=latent_layer_sizes[i - 1],
                               out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = posterior_input_size \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_posterior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_posterior = nn.Linear(penultimate_size, latent_variable_size)

        context_size = params['hrnn_params']['conversation_encoder_hidden_size']
        if latent_layer_sizes is not None:
            context_size += latent_layer_sizes[-1]
        self.decoder = SwitchingDecoder(context_size=context_size,
                                        vocab_size=len(train_vocab),
                                        **params['decoder_params'])
        if self.cuda_available:
            self.cuda()
        self.decoder.set_pretrained_embeddings(
            self.encoder.gensen.encoder.src_embedding.weight.data)

    def reparametrize(self, mu, logvariance):
        """Sample the latent variable with the reparametrization trick.

        :param mu: mean of the Gaussian
        :param logvariance: log-variance of the Gaussian
        :return: a sample mu + eps * std with eps ~ N(0, I)
        """
        std = torch.exp(0.5 * logvariance)
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        eps = Variable(torch.randn(std.data.shape, out=tt()))
        return mu + eps * std

    def forward(self, input_dict, return_latent=False):
        # encoder result: (batch_size, max_conv_length, conversation_encoder_hidden_size)
        conversation_representations, sentence_representations = \
            self.encoder(input_dict, return_all=True,
                         return_sentence_representations=True)
        batch_size, max_conversation_length, max_utterance_length = \
            input_dict["dialogue"].data.shape

        # get movie_recommendations (batch, max_conv_length, n_movies)
        if self.params['language_aware_recommender']:
            user_rep_from_language = self.language_to_user(
                conversation_representations)
        movie_recommendations = self.recommender_module(
            dialogue=input_dict["dialogue"],
            senders=input_dict["senders"],
            lengths=input_dict["lengths"],
            conversation_lengths=input_dict["conversation_lengths"],
            movie_occurrences=input_dict["movie_occurrences"],
            recommend_new_movies=False,
            user_representation=user_rep_from_language
            if self.params['language_aware_recommender'] else None)

        # TODO: only decode recommender's utterances
        # Decoder:
        utterances = input_dict["dialogue"].view(
            batch_size * max_conversation_length, -1)
        lengths = input_dict["lengths"]
        # order by descending utterance length
        lengths = lengths.reshape((-1))
        sorted_lengths, sorted_idx, rev = sort_for_packed_sequence(
            lengths, cuda=self.cuda_available)
        sorted_utterances = utterances.index_select(0, sorted_idx)

        # shift the context vectors one step in time
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        pad_tensor = Variable(torch.zeros(
            batch_size, 1,
            self.params['hrnn_params']['conversation_encoder_hidden_size'],
            out=tt()))
        conversation_representations = torch.cat(
            (pad_tensor, conversation_representations),
            1).narrow(1, 0, max_conversation_length)
        # and reshape+reorder the same way as utterances
        conversation_representations = conversation_representations.contiguous().view(
            batch_size * max_conversation_length,
            self.params['hrnn_params']['conversation_encoder_hidden_size'])\
            .index_select(0, sorted_idx)

        # shift the movie recommendations one step in time
        pad_tensor = Variable(torch.zeros(batch_size, 1, self.n_movies, out=tt()))
        movie_recommendations = torch.cat(
            (pad_tensor, movie_recommendations),
            1).narrow(1, 0, max_conversation_length)
        # and reshape+reorder movie_recommendations the same way as utterances
        movie_recommendations = movie_recommendations.contiguous().view(
            batch_size * max_conversation_length, -1).index_select(0, sorted_idx)

        # consider only lengths > 0
        num_positive_lengths = np.sum(lengths > 0)
        sorted_utterances = sorted_utterances[:num_positive_lengths]
        sorted_lengths = sorted_lengths[:num_positive_lengths]
        conversation_representations = conversation_representations[:num_positive_lengths]
        movie_recommendations = movie_recommendations[:num_positive_lengths]

        # Latent variable
        if self.params['latent_layer_sizes'] is not None:
            # remember that conversation_representations have been shifted one step in time
            h_prior = conversation_representations
            for layer in self.prior_hidden_layers:
                h_prior = F.relu(layer(h_prior))
            mu_prior = self.mu_prior(h_prior)
            logvar_prior = self.sigma_prior(h_prior)

            # posterior conditioned on the current context and on the
            # representation of the next utterance (the one about to be decoded)
            # reshape sentence representations the same way as utterances
            sentence_representations = sentence_representations.view(
                batch_size * max_conversation_length,
                2 * self.params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            ).index_select(0, sorted_idx)
            sentence_representations = sentence_representations[:num_positive_lengths]
            h_posterior = torch.cat(
                (conversation_representations, sentence_representations), 1)
            for layer in self.posterior_hidden_layers:
                h_posterior = F.relu(layer(h_posterior))
            mu_posterior = \
                self.mu_posterior(h_posterior)
            logvar_posterior = self.sigma_posterior(h_posterior)

            # In training, sample from the posterior distribution.
            # At test time, sample from the prior.
            mu, logvar = (mu_posterior, logvar_posterior) if self.training \
                else (mu_prior, logvar_prior)
            z = self.reparametrize(mu, logvar)
            context = torch.cat((conversation_representations, z), 1)
        else:
            context = conversation_representations

        # Run decoder
        outputs = self.decoder(sorted_utterances,
                               sorted_lengths,
                               context,
                               movie_recommendations,
                               log_probabilities=True,
                               sample_movies=False)

        # Complete the missing sequences (of length 0)
        if num_positive_lengths < batch_size * max_conversation_length:
            tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
            pad_tensor = Variable(torch.zeros(
                batch_size * max_conversation_length - num_positive_lengths,
                max_utterance_length,
                len(self.train_vocab) + self.n_movies,
                out=tt()))
            outputs = torch.cat((outputs, pad_tensor), 0)
        # print("OUTPUT SHAPE :", outputs.data.shape)
        # (batch * max_conv_len, max_sentence_len, vocab + n_movie)

        # retrieve original order
        outputs = outputs.index_select(0, rev). \
            view(batch_size, max_conversation_length, max_utterance_length, -1)
        # print("OUTPUT SHAPE RETRIEVED IN ORDER", outputs.data.shape)
        # (batch, max_conv_len, max_sentence_len, vocab + n_movie)

        if return_latent:
            if self.params['latent_layer_sizes'] is None:
                raise ValueError(
                    "Model has no latent variable, cannot return latent parameters.")
            return outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior
        return outputs

    def train_iter(self, batch, criterion, kl_coefficient=1):
        self.train()
        if self.params['latent_layer_sizes'] is not None:
            outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior = \
                self.forward(batch, return_latent=True)
        else:
            outputs = self.forward(batch, return_latent=False)
        batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape
        # indices of recommender's utterances (< batch * max_conv_len)
        idx = Variable(
            torch.nonzero((batch["senders"].view(-1) == -1).data).squeeze())
        # select recommender's utterances for the loss
        outputs = outputs.view(-1, max_seq_length, vocab_size).index_select(0, idx)
        target = batch["target"].view(-1, max_seq_length).index_select(0, idx)
        loss = criterion(outputs.view(-1, vocab_size), target.view(-1))

        # variational loss = KL(posterior || prior)
        if self.params['latent_layer_sizes'] is not None:
            # for two normal distributions,
            # kld(p1, p2) = log(sig2 / sig1) + (sig1^2 + (mu1-mu2)^2) / (2 sig2^2) - 1/2
            # multivariate (sig1 and sig2 the covariance matrices):
            # .5 * (tr(sig2^-1 sig1) + (mu2-mu1)T sig2^-1 (mu2-mu1) - k + ln(det(sig2) / det(sig1)))
            # in the case where sig1 and sig2 are diagonal:
            # .5 * sum(sig1^2 / sig2^2 + (mu2-mu1)^2 / sig2^2 - 1 + ln(sig2^2) - ln(sig1^2))
            kld = .5 * (
                -1 + logvar_prior - logvar_posterior +
                (torch.exp(logvar_posterior) +
                 (mu_posterior - mu_prior).pow(2)) / torch.exp(logvar_prior))
            kld = torch.mean(torch.sum(kld, -1))
            # print("NLL loss {} KLD {}".format(loss.data, kld.data))
            # Weight the KL term by its coefficient (the original used `+`
            # here, which only adds a constant to the loss).
            loss += kl_coefficient * kld

        # backward pass
        loss.backward()
        return loss.data[0]

    def evaluate(self, batch_loader, criterion, subset="valid"):
        """
        Evaluate function
        :param subset: in {"valid", "train"}. Subset on which to evaluate
        :return: the mean loss.
""" self.eval() batch_loader.batch_index[subset] = 0 n_batches = batch_loader.n_batches[subset] losses = [] for _ in tqdm(range(n_batches)): # load batch batch = batch_loader.load_batch(subset=subset) if self.cuda_available: batch["dialogue"] = batch["dialogue"].cuda() batch["target"] = batch["target"].cuda() batch["senders"] = batch["senders"].cuda() # compute output and loss outputs = self.forward(batch) batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape # indices of recommender's utterances(< batch * max_conv_len) idx = Variable( torch.nonzero( (batch["senders"].view(-1) == -1).data).squeeze()) # select recommender's utterances for the loss outputs = outputs.view(-1, max_seq_length, vocab_size).index_select(0, idx) target = batch["target"].view(-1, max_seq_length).index_select(0, idx) loss = criterion(outputs.view(-1, vocab_size), target.view(-1)) losses.append(loss.data[0]) print("{} loss : {}".format(subset, np.mean(losses))) self.train() return np.mean(losses)
""" import sys import json import h5py import numpy as np DATA_PATH = '/hdd/robik/CLEVR' GENSEN_PATH = '/hdd/robik/projects/gensen' sys.path.append(f'{GENSEN_PATH}') from gensen import GenSen, GenSenSingle gensen_1 = GenSenSingle( model_folder=f'{GENSEN_PATH}/data/models', filename_prefix='nli_large_bothskip', cuda=True, pretrained_emb=f'{GENSEN_PATH}/data/embedding/glove.840B.300d.h5') for split in ['train', 'val']: feat_h5 = h5py.File(f'{DATA_PATH}/questions_{split}_clevr.h5', 'w') ques = json.load( open(f'{DATA_PATH}/questions/CLEVR_{split}_questions.json')) ques = ques['questions'] questions = [q['question'] for q in ques] qids = [q['question_index'] for q in ques] qids = np.int64(qids) dt = h5py.special_dtype(vlen=str) feat_h5.create_dataset('feats', (len(qids), 2048), dtype=np.float32) feat_h5.create_dataset('qids', (len(qids), ), dtype=np.int64) feat_h5.create_dataset('questions', (len(qids), ), dtype=dt)
help="Path to pretrained embeddings", default= '/data/milatmp1/subramas/embeddings/new_glove.840B.300d.h5', # (Don't mess with this) ) args = parser.parse_args() batch_size = 20000 hidden_size = 2048 max_length = 100 data_file = args.train_filename iterator = SentenceIterator(data_file, vocab_size=80000, max_length=max_length) model = GenSenSingle(model_folder=args.folder_path, filename_prefix=args.prefix, pretrained_emb=args.pretrain, cuda=True) iterator.word2id = model.word2id iterator.id2word = model.id2word model.vocab_expansion(model.id2word.values()) sentences = iterator.lines if batch_size is 'all' else iterator.lines[ 0:batch_size] sentences = [' '.join(s[:max_length]) for s in sentences] repr_last_h = np.empty((0, hidden_size)) for mbatch_idx, mbatch in enumerate(range(0, len(sentences), 200)): less_sentences = sentences[mbatch:mbatch + 200] _, last_h = model.get_representation(less_sentences, pool='last', return_numpy=True, tokenize=False) repr_last_h = np.append(repr_last_h, last_h, axis=0)