def load_inferSent(sentences):
    logger.info('load InferSent')
    V = 2
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # Set word vectors: GloVe for v1, fastText for v2.
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use GloVe Embedding')
    elif V == 2:
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError

    infersent.set_w2v_path(W2V_PATH)

    # Build vocab from the input sentences.
    infersent.build_vocab(sentences, tokenize=True)
    return infersent
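# Usage sketch (illustrative, not from the original source): the sample
# sentences below are made up. encode() returns one 4096-d vector per sentence.
sentences = ['A man is playing a guitar.', 'A woman reads a book.']
infersent = load_inferSent(sentences)
embeddings = infersent.encode(sentences, tokenize=True)
print(embeddings.shape)  # (2, 4096)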
def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    # `model_version` (1 or 2) is expected to be defined at module level.
    model_path = "infersent/encoder/infersent{}.pkl".format(model_version)
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))
    if (not torch.cuda.is_available()) and force_gpu:
        raise GPUNotFoundException()
    if torch.cuda.is_available():
        model = model.cuda()

    # If infersent1 -> use GloVe embeddings.
    # If infersent2 -> use fastText embeddings.
    W2V_PATH = ('infersent/dataset/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'infersent/dataset/fastText/crawl-300d-2M.vec')  # noqa
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words.
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model
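# Usage sketch (illustrative): `model_version` must be set at module level
# before calling get_loaded_model(); the sentence below is made up.
model_version = 2
model = get_loaded_model(force_gpu=False, k_most_frequent_words=100000)
embeddings = model.encode(['The cat sits on the mat.'], tokenize=True)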
def create_embeddings(infer_path, data_path, em_type):
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # YouTube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # WikiHow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        raise ValueError("Unknown embedding type: {}".format(em_type))

    nltk.download('punkt')
    V = 1
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()
    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)
    # Build the vocab over both title sets so either can be encoded.
    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
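# Usage sketch (illustrative): paths assume the InferSent v1 checkpoint and
# GloVe vectors live under `infer_path`, as the function above expects.
# create_embeddings('InferSent', 'data', 'yt')   # writes data/yt_embed.npy
# create_embeddings('InferSent', 'data', 'wh')   # writes data/wh_embed.npy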
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    To make this work, first run ./get_infersent.sh
    """
    MODEL_PATH = infersent_path / "encoder/infersent1.pkl"
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    if force_cpu:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        model.cuda()  # only move to GPU when not forcing CPU

    W2V_PATH = infersent_path / 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    csv_data = read_csv(dataset_path / 'train.csv')
    csv_data = csv_data[1:]  # skip header
    data = defaultdict(list)
    for irow, row in enumerate(csv_data):
        if 'snips' in str(dataset_path):
            utterance, labels, delexicalised, intent = row
        else:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        data[intent].append(utterance)

    vectors = {}
    for i, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(i, len(data)))
        embeddings = model.encode(sentences)
        avg_embedding = np.mean(embeddings, axis=0)
        vectors[intent] = avg_embedding
    return vectors
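# Usage sketch (illustrative): embed_dataset expects pathlib.Path arguments,
# since it joins paths with the / operator. The directory names are made up.
from pathlib import Path

vectors = embed_dataset(Path('data/snips'), Path('InferSent'), force_cpu=True)
print(sorted(vectors.keys()))  # one averaged 4096-d embedding per intent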
def apply_logician(s1, s2, is_list=False, sick_model=False):
    # is_list: if False, s1/s2 are raw sentence strings;
    #          if True, s1/s2 are lists of lists of words.
    # sick_model: if True, use the SICK model for prediction;
    #             if False, use the SNLI model.

    # Load InferSent model (V, MODEL_PATH, PATH_TO_W2V, PATH_TO_DATA are globals).
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop',
                                     'batch_size': 128, 'tenacity': 3,
                                     'epoch_size': 2}
    params_senteval['infersent'] = model.cuda()

    if not is_list:
        s1 = convert_str2lst(s1)
        s2 = convert_str2lst(s2)
    samples = s1 + s2
    params_senteval['batch_size'] = min(128, len(s1))
    params_senteval = utils.dotdict(params_senteval)
    params_senteval.usepytorch = True
    prepare(params_senteval, samples)
    emb_s1 = batcher(params_senteval, s1)
    emb_s2 = batcher(params_senteval, s2)

    if sick_model:
        testF = np.c_[np.abs(emb_s1 - emb_s2), emb_s1 * emb_s2]
        cp = torch.load('./saved_sick.pth')
        print('[Contradiction Neutral Entailment]')
    else:
        testF = np.c_[emb_s1, emb_s2, emb_s1 * emb_s2, np.abs(emb_s1 - emb_s2)]
        cp = torch.load('./saved_snli_augment_ordered.pth')
        print('[Entailment Neutral Contradiction]')

    inputdim = testF.shape[1]
    nclasses = 3
    clf = nn.Sequential(nn.Linear(inputdim, nclasses)).cuda()
    clf.load_state_dict(cp)
    testF = torch.FloatTensor(testF).cuda()
    out = clf(testF)
    sf = nn.Softmax(1)
    probs = sf(out)
    return probs
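# Usage sketch (illustrative; relies on the unseen convert_str2lst, prepare
# and batcher helpers). With the SNLI head, the probabilities come back in
# the order [entailment, neutral, contradiction], as printed above.
probs = apply_logician('A man is eating food.', 'A man is eating.')
print(probs)  # shape (n, 3); each row sums to 1 after the softmax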
def init_models(vocab_size: int = VOCAB_SIZE):
    model = InferSent({
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': VERSION
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model
    model.set_w2v_path(VECTOR_PATH)
    model.build_vocab_k_words(K=vocab_size)
    return model
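# Usage sketch (illustrative): MODEL_PATH, VECTOR_PATH, VERSION, USE_CUDA and
# VOCAB_SIZE are module-level constants assumed by init_models().
model = init_models()
embeddings = model.encode(['An example sentence.'], tokenize=True)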
def infersent_embeddings():
    sys.path.append(
        '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')

    # Load model
    from models import InferSent
    model_version = 1
    MODEL_PATH = ("/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/"
                  "encoder/infersent%s.pkl" % model_version)
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = ('/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
                if model_version == 1
                else '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words
    model.build_vocab_k_words(K=100000)

    train_data_list = model.encode(final_train['text'].tolist(), bsize=128,
                                   tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))
    test_data_list = model.encode(final_test['text'].tolist(), bsize=128,
                                  tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list
def infersent_glove():
    # Set up model for InferSent + GloVe
    V = 1
    MODEL_PATH = '/tmp/GloVe/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    modelg = InferSent(params_model)
    modelg.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = True
    modelg = modelg.cuda() if use_cuda else modelg

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = ('/tmp/GloVe/glove.840B.300d.txt' if V == 1
                else '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt')
    modelg.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words
    modelg.build_vocab_k_words(K=100000)
    return modelg
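# Usage sketch (illustrative): encode two sentences and compare them with
# cosine similarity via numpy; the sentences are made up.
import numpy as np

modelg = infersent_glove()
emb = modelg.encode(['How do I learn Python?',
                     'What is the best way to learn Python?'], tokenize=True)
cos_sim = np.dot(emb[0], emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(cos_sim)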
def init_infersent_model(self):
    model_version = 1
    MODEL_PATH = "encoder/infersent%s.pkl" % model_version
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = ('GloVe/glove.840B.300d.txt' if model_version == 1
                else 'fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words
    model.build_vocab_k_words(K=100000)
    self.model = model
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load InferSent model (V, MODEL_PATH, PATH_TO_W2V, params_senteval are globals).
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)
    params_senteval['infersent'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
        'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
        'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift',
        'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion'
    ]
    results = se.eval(transfer_tasks)
    print(results)
args = parser.parse_args()
print("download: ", args.download)
print("Model: ", args.model_version)
print("Making cosine vector : ", args.cosine)
if args.download:
    nltk.download('punkt')

model_version = args.model_version
MODEL_PATH = "/home1/InferSent/encoder/infersent%s.pickle" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = True
model = model.cuda() if use_cuda else model

W2V_PATH = '/home1/InferSent/oov_train_model.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of the K most frequent words
# model.build_vocab_k_words(K=100000)
model.build_vocab_k_words(K=2051129)

# Load test sentences
train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None,
                         delimiter=",", encoding='UTF-8')
source_s = train_test[0][1:]
target_s = train_test[1][1:]

embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
print('nb source_s encoded : {0}'.format(len(embeddings_source)))
embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
print('nb target_s encoded : {0}'.format(len(embeddings_target)))
def main():
    # Dictionary for final rankings.
    ranking = dict()

    print("\n CSI 4107 - Microblog information retrieval system \n")
    print("\n Importing Query Files and Documents... \n")

    # Load the tweet list, e.g.
    # {'34952194402811904': 'Save BBC World Service from Savage Cuts http://www.petitionbuzz.com/petitions/savews', ...}
    tweets_dict = importTweets()

    # Load the list of queries, e.g.
    # {1: ['bbc', 'world', 'servic', 'staff', 'cut'], ...}
    queries_dict = importQuery()

    print("\n Importing Done! \n")
    print("\n Initializing InferSent Model... \n")

    # Initialize InferSent model.
    infersent = InferSent(params_model)
    # Load InferSent v1 model encoder.
    infersent.load_state_dict(torch.load(MODEL_PATH))
    # Use GPU mode if enabled.
    infersent = infersent.cuda() if USE_CUDA else infersent
    # Load pre-trained GloVe vectors.
    infersent.set_w2v_path(W2V_PATH)

    print("\n InferSent Initialization Done! \n")
    print("\n Building Vocabulary from Tweets... \n")

    # Deconstruct the dictionary of documents into document IDs and contents.
    tweets = list(tweets_dict.values())
    tweet_ids = list(tweets_dict.keys())
    # Only the query contents are needed, since query IDs can be reconstructed.
    queries = list(queries_dict.values())

    # Build the InferSent vocabulary from all document contents.
    infersent.build_vocab(tweets, tokenize=False)

    print("\n Vocabulary Completed! \n")
    print("\n Building Document & Query Vectors... \n")

    doc_embeddings = infersent.encode(tweets, bsize=128, tokenize=False, verbose=True)
    query_embeddings = infersent.encode(queries, bsize=128, tokenize=False, verbose=True)

    print("\n Building Document & Query Vectors Done! \n")
    print("\n Retrieval and Ranking... \n")

    dranking = dict()
    for query_id in range(len(queries)):
        # Encoded array starts at 0 for the first chronological document.
        current_document = 0
        # Compute the cosine score between the current query and every document.
        for tweet_id in tweet_ids:
            dranking[tweet_id] = cosine(doc_embeddings[current_document],
                                        query_embeddings[query_id])
            current_document += 1
        # Keep the top 1000 documents, ordered by score, in ranking.
        ranking[query_id + 1] = {
            k: v for k, v in sorted(dranking.items(),
                                    key=lambda item: item[1],
                                    reverse=True)[:1000]
        }
        print("Query " + str(query_id) + " Done.")
        dranking.clear()

    # Create the resulting file.
    resultFileCreation(ranking)
    print("\n Retrieval and Ranking Done! \n")
import argparse
import json

import numpy as np
import pandas as pd
import torch
from numpy import save
from tqdm import tqdm

from models import InferSent

V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
model = model.cuda()
W2V_PATH = "./fastText/crawl-300d-2M.vec"
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=100000)


def infersent_embed_doc(rpath, wpath):
    df = pd.read_csv(rpath, chunksize=1000)
    text = []
    count = 0
    for chunk in df:
        text = text + chunk['comment'].tolist()
    error_idx = []
    with open(wpath, 'w+') as fw:
        for i in range(0, len(text)):
            try:
nlp = spacy.load("en_core_web_sm")

MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"

params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': 2
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
use_cuda = True
infersent = infersent.cuda() if use_cuda else infersent


def get_batch_from_dataframe(currentidx):
    # The deprecated DataFrame.ix indexer is replaced with .loc here.
    to_fetch = currentidx + 640
    abs_arr = dfs.loc[currentidx:to_fetch, 'clean_text'].tolist()
    catg_arr = dfs.loc[currentidx:to_fetch, 'category'].tolist()
    subj_arr = dfs.loc[currentidx:to_fetch, 'set'].tolist()
    currentidx = currentidx + 640
    # NOTE: title_arr is returned but never assigned in the original code;
    # it must be defined (or dropped from the return value) before this runs.
    return abs_arr, catg_arr, subj_arr, title_arr, currentidx


def with_stopwords():
    pds = pd.DataFrame(columns=['embds', 'set', 'catg'])
# Set up logger
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)

# Load model
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
if torch.cuda.is_available():
    infersent.cuda()

# Set word vectors: GloVe for v1, fastText for v2.
if V == 1:
    W2V_PATH = 'Glove/glove.840B.300d.txt'
    logger.info('Use GloVe Embedding')
elif V == 2:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    logger.info('Use fastText Embedding')
else:
    raise NotImplementedError

infersent.set_w2v_path(W2V_PATH)

# Read data
refs = []
with open(args.golden, 'r') as f:
parser.add_argument('-c', '--cpu', action='store_true',
                    help='Use CPU instead of GPU.')
parser.add_argument('-b', '--batch-size', type=int, default=64,
                    help='Batch size (default: 64)')
parser.add_argument('files', nargs='+',
                    help='List of files to extract sentence embeddings')
args = parser.parse_args()

params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version}
model = InferSent(params_model)
model.load_state_dict(torch.load(args.model_path))
if not args.cpu:
    model = model.cuda()
model.set_w2v_path(args.w2v_path)

# Ensure the output directory exists.
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# Read files and extract features.
for fpath in args.files:
    print('Reading file {}'.format(fpath))
    sents = []
    with open(fpath) as f:
        for line in f:
            line = line.strip()
            assert line, 'Empty line in {}'.format(fpath)
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Model options
    parser.add_argument("--model_checkpoint", help="Model checkpoint to use", type=str, default='')
    parser.add_argument("--word_vec_file", help="Word vector file to use", type=str)
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    log_file = os.path.join(args.out_dir, "results.log")
    fileHandler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(fileHandler)
    logging.info(args)

    # Define SentEval params
    params_senteval = {
        'task_path': PATH_TO_DATA,
        'usepytorch': args.use_pytorch,
        'kfold': 10,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size,
        'load_data': args.load_data,
        'seed': args.seed
    }
    params_senteval['classifier'] = {
        'nhid': 0,
        'optim': 'rmsprop',
        'batch_size': 128,
        'tenacity': 3,
        'epoch_size': 2
    }

    # Load InferSent model (V is a module-level global).
    params_model = {
        'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
        'pool_type': 'max', 'dpout_model': 0.0, 'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(args.model_checkpoint))
    model.set_w2v_path(args.word_vec_file)
    params_senteval['infersent'] = model.cuda()

    # Run SentEval
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    tasks = get_tasks(args.tasks)
    results = se.eval(tasks)
    write_results(results, args.out_dir)
    logging.info(results)
# Model
encoder_types = [
    'InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
    'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
]
assert params.encoder_type in encoder_types, "encoder_type must be in " + \
    str(encoder_types)

infersent_net = InferSent(config_nli_model)
print(infersent_net)
infersent_net.load_state_dict(torch.load('./encoder/infersent1.pkl'))
infersent_net.cuda()

# Freeze the encoder weights.
for parameters_infer in infersent_net.parameters():
    parameters_infer.requires_grad = False

ae_model = DisEnc.LinearAutoEncoder(params.dis_emb_dim).cuda()
print(ae_model)


def cos_distance(a, b):
    return (1. - torch.nn.functional.cosine_similarity(a, b))


def hamming_distance(a, b):
    # return (a-b).abs().sum()
nli_net = InferSent(params_model)
nli_net.load_state_dict(torch.load(MODEL_PATH))
print(nli_net)

# Loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
loss_fn.size_average = False

# Optimizer
optim_fn, optim_params = get_optimizer(params.optimizer)
optimizer = optim_fn(nli_net.parameters(), **optim_params)

# CUDA by default
nli_net.cuda()
loss_fn.cuda()

"""
TRAIN
"""
val_acc_best = -1e10
adam_stop = False
stop_training = False
lr = optim_params['lr'] if 'sgd' in params.optimizer else None


def evaluate(epoch, eval_type='valid', final_eval=False):
    nli_net.eval()
    correct = 0.
    global val_acc_best, lr, stop_training, adam_stop
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)

    args = get_args()
    print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)
    use_cuda = False if args.no_cuda else True
    verbose = args.verbose

    # Model config
    config = {
        'word_emb_dim': args.word_emb_dim,
        'enc_lstm_dim': args.enc_lstm_dim,
        'n_enc_layers': args.n_enc_layers,
        'dpout_model': args.dpout_model,
        'dpout_fc': args.dpout_fc,
        'fc_dim': args.fc_dim,
        'bsize': args.batch_size,
        'n_classes': args.n_classes,
        'pool_type': args.pool_type,
        'nonlinear_fc': args.nonlinear_fc,
        'use_cuda': use_cuda,
        'version': args.model_version,
        'dropout_prob': args.dropout_prob,
    }

    # Load model
    if verbose:
        print('loading model...')
    model = InferSent(config)
    model.load_state_dict(torch.load(args.model_path))
    model = model.cuda() if not args.no_cuda else model
    model.set_w2v_path(args.word_emb_path)
    model.build_vocab_k_words(K=args.k_freq_words, verbose=verbose)

    # Load classifier
    classifier = SimpleClassifier(config)
    classifier = classifier.cuda() if not args.no_cuda else classifier

    # Get train examples and calculate t_total
    train_examples = task.get_train_examples()
    t_total = initialization.get_opt_train_steps(len(train_examples), args)

    # Build optimizer.
    optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

    # Create running parameters
    r_params = RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        learning_rate=5e-5,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=t_total,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
        verbose=verbose)

    # Create runner class for training and evaluation tasks.
    runner = GlueTaskClassifierRunner(
        encoder_model=model,
        classifier_model=classifier,
        optimizer=optimizer,
        label_list=task.get_labels(),
        device=device,
        rparams=r_params)

    if args.do_train:
        runner.run_train_classifier(train_examples)

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=verbose)

        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False, index=False)
        metrics_str = json.dumps(
            {"loss": results["loss"], "metrics": results["metrics"]},
            indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_example = MnliMismatchedProcessor().get_dev_examples(
                task.data_dir)
            mm_results = runner.run_val(mm_val_example, task_name=task.name,
                                        verbose=verbose)

            # Note: the original wrote results["logits"] here, which looks like
            # a copy-paste slip; the mismatched predictions are in mm_results.
            df = pd.DataFrame(mm_results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"),
                      header=False, index=False)

            combined_metrics = {}
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps(
                {"loss": results["loss"], "metrics": combined_metrics},
                indent=2)
            print(combined_metrics_str)
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)
# In[113]:

from random import randint

import numpy as np
import torch

from models import InferSent

model_version = 1
MODEL_PATH = "/home/anuja/Desktop/BE project/Models/InferSent/infersent1.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infermodel = InferSent(params_model)
infermodel.load_state_dict(torch.load(MODEL_PATH))

use_cuda = False
infermodel = infermodel.cuda() if use_cuda else infermodel

W2V_PATH = '/home/anuja/Desktop/BE project/glove.6B/glove.840B.300d.txt'  # replace with glove.840B.300d.txt
infermodel.set_w2v_path(W2V_PATH)
infermodel.build_vocab_k_words(K=100000)


# In[114]:

df = pd.DataFrame(columns=['body', 'replier', 'thread_no', 'embeddings'])
folder = glob.glob(folder_path)
th_no = 0
obj = preprocessing.preprocess()
cnt = 0
count_file = 0
def main():
    init_output_dir(output_dir)

    # Prepare dataset
    task = get_task(task_name, dataset_path)
    label_list = task.get_labels()
    label_map = {v: i for i, v in enumerate(label_list)}

    print("loading raw data ... ")
    train_examples = task.get_train_examples()
    val_examples = task.get_dev_examples()
    test_examples = task.get_test_examples()

    print("converting to data loader ... ")
    train_loader = get_dataloader(train_examples, label_map)
    val_loader = get_dataloader(val_examples, label_map)
    test_loader = get_dataloader(test_examples, label_map)

    # Load model
    print("loading model ... ")
    model = InferSent(config)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda() if config['use_cuda'] else model
    model.set_w2v_path(word_emb_path)

    print("building model vocabs ... ")
    model.build_vocab_k_words(K=100000, verbose=True)

    # Run embedding for each split
    print("Run embedding for train set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=train_loader, model=model, mode='train')

    print("Run embedding for dev set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=val_loader, model=model, mode='dev')

    print("Run embedding for test set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=test_loader, model=model, mode='test')

    # HACK for MNLI mismatched
    if task_name == 'mnli':
        print("Run Embedding for MNLI Mis-Matched Datasets")
        print("loading raw data ... ")
        mm_val_example = MnliMismatchedProcessor().get_dev_examples(dataset_path)
        mm_test_examples = MnliMismatchedProcessor().get_test_examples(dataset_path)

        print("converting to data loader ... ")
        mm_val_loader = get_dataloader(mm_val_example, label_map)
        mm_test_loader = get_dataloader(mm_test_examples, label_map)

        print("Run embedding for mm_dev set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_val_loader, model=model, mode='mm_dev')

        print("Run embedding for mm_test set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_test_loader, model=model, mode='mm_test')
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
    'pool_type': 'max', 'dpout_model': 0.0, 'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K=100000)
infersent = infersent.cuda()


class PoetryDataset(Dataset):

    def __init__(self, data_dir, split, create_data, **kwargs):
        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.max_sequence_length = kwargs.get('max_sequence_length', 50)
        self.min_occ = kwargs.get('min_occ', 3)

        self.raw_data_path = os.path.join(data_dir, 'poems.csv')
        self.data_file = 'poems.{}.json'.format(self.split)
        self.vocab_file = 'poems.vocab.json'

        self.categories = [['love', 'relationships', 'marriage'],