import os
import pathlib
import pickle

import gensim.downloader
import numpy as np
import pyemd
from nltk.corpus import stopwords
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

import utils

stop_words = stopwords.words("english")

# Initialize GloVe embeddings
print("Loading GloVe embeddings...")
glove_embeddings = gensim.downloader.load("glove-wiki-gigaword-300")

# Load corpus
corpus = utils.load_corpus()


# Returns the average GloVe vector of a tokenized sentence.
def get_vec(tokenized_sentence):
    # Skip out-of-vocabulary tokens so a missing word does not raise a
    # KeyError; a sentence with no in-vocabulary tokens yields a zero vector.
    in_vocab = [glove_embeddings[word] for word in tokenized_sentence
                if word in glove_embeddings]
    if not in_vocab:
        return np.zeros(glove_embeddings.vector_size)
    return np.mean(np.array(in_vocab), axis=0)


def calculate_similarity(cc_embedding, tokenized):
    # `tokenized` is a list of (file, sentences) pairs, where each sentence
    # is a list of tokens.
    similarity_scores = []
    for article in tokenized:
        current_article = []
        file = article[0]
        for tokenized_sentence in article[1]:
            sentence_embedding = get_vec(tokenized_sentence)
            # scipy's cosine distance is 1 - cosine similarity and ranges
            # over [0, 2], so cap it at 1.0 as in the original logic.
            score = distance.cosine(cc_embedding, sentence_embedding)
            if score > 1.0:
                score = 1.0
            current_article.append(score)
        # (Assumed completion of the truncated original: keep per-sentence
        # scores alongside the source file name.)
        similarity_scores.append((file, current_article))
    return similarity_scores
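
# A minimal usage sketch, not part of the original script: the query tokens
# and the assumed corpus layout (a list of (filename, tokenized sentences)
# pairs) are illustrative.
if __name__ == "__main__":
    query_embedding = get_vec(["climate", "change"])
    scores = calculate_similarity(query_embedding, corpus)
    for file, sentence_scores in scores[:3]:
        # Lower cosine distance means the sentence is closer to the query.
        print(file, min(sentence_scores, default=None))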
from collections import defaultdict

import torch
from tqdm import tqdm

# Fall back to the generic number of bases when no KB-specific value is given.
if args.n_basis_kb < 0:
    args.n_basis_kb = args.n_basis

# Set the random seed manually for reproducibility.
seed_all_randomness(args.seed, args.cuda, randomness=args.randomness)

########################
print("Loading data")
########################

device = torch.device("cuda" if args.cuda else "cpu")

idx2word_freq, target_idx2word_freq, dataloader_train_arr, dataloader_val, \
    dataloader_val_shuffled, max_sent_len = load_corpus(
        args.data, args.batch_size, args.batch_size, device,
        skip_training=args.skip_train, want_to_shuffle_val=False)
dataloader_train = dataloader_train_arr[0]

kb_rels = []
kb_markers = []
kb_num_basis = []
kb_rel_idx = set()
# kb relation index to no. of bases mapping
kbidx2num_basis = defaultdict(int)

for batch in tqdm(dataloader_train):
    feature, target, kb_marker, num_basis = batch
    # indices of kb relations within the batch
    kb_indices = (kb_marker == 1).nonzero().flatten()
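
# A minimal sketch of the masking step above (illustrative values, not
# taken from the training data): kb_marker flags which examples in the
# batch are KB relations, and nonzero() recovers their positions.
#
#     marker = torch.tensor([0, 1, 1, 0])
#     (marker == 1).nonzero().flatten()  # -> tensor([1, 2])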
# Minimal logging helper; `f_log` is assumed to be a log-file handle opened
# earlier in the script.
def logging(s):
    print(s)
    f_log.write(s + '\n')

# Set the random seed manually for reproducibility.
seed_all_randomness(args.seed, args.cuda, randomness=args.randomness)

logging('Args: {}'.format(args))

########################
print("Loading data")
########################

device = torch.device("cuda" if args.cuda else "cpu")

idx2word_freq, target_idx2word_freq, dataloader_train_arr, dataloader_val, \
    dataloader_val_shuffled, max_sent_len = \
    load_corpus(args.data, args.batch_size, args.batch_size, device,
                args.tensor_folder, args.training_file,
                args.training_split_num, args.copy_training,
                skip_val=args.skip_val)


def counter_to_tensor(idx2word_freq, device, rare, smooth_alpha=0):
    # Convert the per-word frequency list into a weight tensor on `device`.
    total = len(idx2word_freq)
    w_freq = torch.zeros(total, dtype=torch.float, device=device, requires_grad=False)
    for i in range(total):
        # w_freq[i] = math.sqrt(idx2word_freq[x][1])
        if rare:
            if i == 0:
                print("Emphasizing the RARE")
            if smooth_alpha == 0:
                if i == 0:
                    print("No alpha-smoothing")
                # Weight each word by its raw corpus count.
                w_freq[i] = idx2word_freq[i][1]
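
# A minimal sketch (illustrative toy counts, not from the corpus) of the
# rare/unsmoothed path above: each word's weight is its raw count.
#
#     toy_freq = [('the', 100), ('cat', 7)]
#     counter_to_tensor(toy_freq, torch.device('cpu'), rare=True)
#     # -> tensor([100., 7.]) (after printing the two i == 0 notices)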