def get_text_vocab(texts):
    word_vocab = Vocabulary()
    char_vocab = Vocabulary(lower=False)
    for item in texts:
        word_vocab.add_documents(item)
        for words in item:
            char_vocab.add_documents(words)
    word_vocab.build()
    char_vocab.build()
    return word_vocab, char_vocab
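# Hedged usage sketch (not from the original source): assumes the Vocabulary API used
# elsewhere in this section (add_documents / build / doc2id and a .vocab dict). The
# toy corpus is invented for illustration.
toy_corpus = [
    ['the', 'cat', 'sat'],
    ['the', 'dog', 'barked'],
]
word_vocab, char_vocab = get_text_vocab([toy_corpus])  # texts is a list of token-list corpora
print('word vocab size:', len(word_vocab.vocab))
print('word ids:', word_vocab.doc2id(toy_corpus[0]))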
def __init__(self, mode, noisy_for_train, sentiment, direction):
    self.mode = mode
    self.root = os.path.join('../data', 'yelp')
    self.noisy = self.mode == 'train' and noisy_for_train

    # Load data from domain 0 and domain 1.
    path = os.path.join(self.root, 'sentiment.{}.{}'.format(mode, sentiment))

    # Load vocabulary.
    print('----- Loading vocab -----')
    self.vocab = Vocabulary('../data/amazon/amazon.vocab')
    print('vocabulary size:', self.vocab.size)
    self.pad = self.vocab.word2id['<pad>']
    self.go = self.vocab.word2id['<go>']
    self.eos = self.vocab.word2id['<eos>']
    self.unk = self.vocab.word2id['<unk>']

    # Tokenize file content.
    with open(path, 'r') as f:
        ids = []
        for line in f:
            words = ['<go>'] + line.split() + ['<eos>']
            if direction == 'forward':
                pass
            elif direction == 'backward':
                words.reverse()
            else:
                raise ValueError()
            for word in words:
                ids.append(self.vocab.word2id[word] if word in self.vocab.word2id else self.unk)
    self.ids = torch.LongTensor(ids)  # (very_long, )
    self.ids = batchify(self.ids, config.batch_size, config)  # shape = (???, batch_size)
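# Hedged sketch of what `batchify` above is assumed to do, following the common
# PyTorch word-language-model recipe; the extra `config` argument in the call is not
# reproduced here (it may carry device or batching options in the original code).
import torch

def batchify_sketch(ids, batch_size):
    # Drop the tail that does not fill a whole column, then reshape the token
    # stream to (num_steps, batch_size) so each column is a contiguous sequence.
    nbatch = ids.size(0) // batch_size
    ids = ids.narrow(0, 0, nbatch * batch_size)
    return ids.view(batch_size, -1).t().contiguous()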
def __init__(self, test_cls, dataset):
    self.vocab = Vocabulary('../data/{}/{}.vocab'.format(dataset, dataset))
    self.Emb = nn.Embedding.from_pretrained(self.vocab.embedding, freeze=False)
    self.Emb = gpu_wrapper(self.Emb)
    if test_cls == 'TextCNN':
        self.C = Discriminator(kernels=config.textCNN_kernels,
                               conv_dim=config.textCNN_conv_dim,
                               dim_h=100,
                               D=2,
                               dropout=config.textCNN_dropout)
    else:
        raise ValueError()
    self.C = gpu_wrapper(self.C)

    self.train_set, self.test_set, self.val_set = None, None, None
    self.logger, self.optim, self.best_acc = None, None, 0
    self.iter_num = 0
    self.lr = config.textCNN_lr
    self.dataset = dataset
    self.model_name = test_cls + '-' + dataset
    self.noisy = True
    self.total_iters = 200000
    self.beta1 = 0.5
    self.beta2 = 0.999
    self.batch_size = 64
    self.num_workers = 8
    self.ROUND = 4
    self.sample_step = 4000
    self.lr_decay_step = 1000
    self.num_iters_decay = 0
    self.max_len = 20
def __init__(
    self,
    data_dir,
    seq_length,
    vocab_size=None,
    vocab=None,
    training=False,
    vocab_from_pretrained="bert-base-uncased",
    do_lower_case=True,
):
    self.data_dir = data_dir
    self.seq_length = seq_length
    self.vocab = Vocabulary()
    with open(os.path.join(data_dir, "rick_and_morty.txt"), "r", encoding="utf-8") as f:
        self.text = f.read()
    if vocab is not None:
        if isinstance(vocab, str):
            self.vocab.load(vocab)
        elif isinstance(vocab, Vocabulary):
            self.vocab = vocab
    elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
        self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
    else:
        self.vocab.add_text(self.text)
        self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
    if vocab_size is not None:
        self.vocab = self.vocab.most_common(vocab_size - 2)
    self.text = self.vocab.clean_text(self.text)
    self.tokens = self.vocab.tokenize(self.text)
def __init__(
    self,
    data_dir,
    vocab_size=None,
    vocab=None,
    seq_length=40,
    training=False,
    vocab_from_pretrained="bert-base-uncased",
    do_lower_case=True,
):
    self.data_dir = data_dir
    self.vocab = Vocabulary(vocab_from_pretrained, do_lower_case)
    self.seq_length = seq_length

    data_all = pd.read_csv(os.path.join(self.data_dir, "combined-data.csv"),
                           sep=' ', header=None, encoding="cp1252")
    data_all[1] = data_all[1] + " " + data_all[2]
    data_all = data_all[[0, 1]]
    data_all.columns = ['label', 'text']
    data_all = data_all[['text', 'label']]
    data_all = data_all[~data_all.text.isna()]
    data_all.label = data_all.label.apply(lambda x: int(x[-1]))
    data_all.text = data_all.text.apply(lambda x: x.lower())
    data_all = data_all.sample(1000)

    self.train_df = data_all.copy()  # pd.DataFrame({"text": [], "label": []})
    self.val_df = pd.DataFrame({"text": [], "label": []})
    self.test_df = data_all.copy()   # pd.DataFrame({"text": [], "label": []})
    del data_all

    if training:
        self.train()
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(
                " ".join(pd.concat([self.train_df, self.val_df], sort=False).text.values)
            )
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
    else:
        self.test()
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            raise Exception("Vocab file is not specified in test mode!")

    if vocab_size is not None:
        self.vocab = self.vocab.most_common(vocab_size - 2)
def __init__(self):
    self.root = os.path.join('data', 'ptb')
    voc_f = os.path.join(self.root, 'ptb.vocab')
    self.max_len = config.max_len
    self.sentence_pairs = []
    sentences = []
    with open(os.path.join(self.root, 'interp_pairs.txt')) as f:
        for line in f.readlines():
            if len(line.strip()) == 0:
                continue
            sent = line.strip().split()
            sentences.append(sent)
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            self.sentence_pairs.append((sentences[i], sentences[j]))
    print('PTB Interpolation data successfully read.')

    # Load vocabulary.
    print('----- Loading vocab -----')
    self.vocab = Vocabulary(voc_f)
    print('vocabulary size:', self.vocab.size)
    self.pad = self.vocab.word2id['<pad>']
    self.go = self.vocab.word2id['<go>']
    self.eos = self.vocab.word2id['<eos>']
    self.unk = self.vocab.word2id['<unk>']
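# Aside (illustrative, not from the original source): the nested pairing loop above
# enumerates every ordered pair of distinct sentences; itertools expresses the same
# idea more compactly.
import itertools

sentences_demo = [['a', 'b'], ['c'], ['d', 'e']]          # toy stand-in for `sentences`
pairs = list(itertools.permutations(sentences_demo, 2))   # all ordered pairs, i != j
assert len(pairs) == len(sentences_demo) * (len(sentences_demo) - 1)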
from collections import Counter

from nltk.tokenize import RegexpTokenizer


def build_vocab(anns, threshold=4):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    for i, ann in enumerate(anns):
        # print('Processing {}/{}...'.format(i+1, len(anns)))
        caption = ann.get('caption')
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(caption.lower())
        counter.update(tokens)

    # If the word frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
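# Hedged usage sketch (not from the original source): `anns` is assumed to be a
# COCO-style list of {'caption': ...} dicts, as implied by ann.get('caption') above;
# len(caption_vocab) assumes the wrapper defines __len__.
anns = [{'caption': 'A dog runs on the grass.'},
        {'caption': 'A dog jumps over a log.'}]
caption_vocab = build_vocab(anns, threshold=1)
print('caption vocab size:', len(caption_vocab))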
def __init__(self, mode):
    self.mode = mode
    assert self.mode in ['train', 'valid', 'test']
    self.root = os.path.join('data', 'switchboard')
    voc_f = os.path.join(self.root, 'switchboard.vocab')
    self.max_len = config.max_len
    self.posts = []
    self.responses = []
    with open(os.path.join(self.root, '{}.txt'.format(self.mode))) as f:
        for line in f.readlines():
            if len(line.strip()) == 0:
                continue
            post, response = line.strip().split('\t')
            self.posts.append(post.split())
            self.responses.append(response.split())
    print('SwitchBoard data successfully read.')

    # Build vocabulary.
    if self.mode == 'train':
        print('----- Building vocab -----')
        build_vocab(self.posts + self.responses, voc_f, min_occur=5)  # TODO

    # Load vocabulary.
    print('----- Loading vocab -----')
    self.vocab = Vocabulary(voc_f)
    print('vocabulary size:', self.vocab.size)
    self.pad = self.vocab.word2id['<pad>']
    self.go = self.vocab.word2id['<go>']
    self.eos = self.vocab.word2id['<eos>']
    self.unk = self.vocab.word2id['<unk>']
def __init__(self, data_dir, seq_length, vocab_size, vocab=None):
    self.df = pd.read_csv(os.path.join(data_dir, 'spam.csv'), encoding="mbcs")
    self.vocab = Vocabulary()
    self.labels = []
    for x in self.df.v1:
        if x == 'ham':
            self.labels.append(0)
        else:
            self.labels.append(1)
    self.seq_length = seq_length
    if vocab is not None:
        if isinstance(vocab, str):
            self.vocab.load(vocab)
        elif isinstance(vocab, Vocabulary):
            self.vocab = vocab
    elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
        self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
    else:
        self.vocab.add_text(" ".join(self.df["v2"].values))
        self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
    if vocab_size is not None:
        self.vocab = self.vocab.most_common(vocab_size - 2)
    self.text = self.vocab.clean_text(" ".join(self.df["v2"].values))
    self.tokens = []
    for content in self.df["v2"].values:
        self.tokens.append(self.vocab.tokenize(self.vocab.clean_text(content)))
def __init__(self, mode):
    self.mode = mode
    assert self.mode in ['train', 'valid', 'test']
    self.root = os.path.join('data', 'yelp')
    voc_f = os.path.join(self.root, 'yelp.vocab')
    self.max_len = config.max_len
    self.sentences = []
    with open(os.path.join(self.root, '{}.txt'.format(self.mode))) as f:
        for line in f.readlines():
            if len(line.strip()) in [0, 1]:
                continue
            words = line.strip().split()
            assert words[0] in [str(dig) for dig in range(5)], \
                '{} does not start with the rating'.format(words)
            self.sentences.append(words[1:])
    print('Yelp data successfully read.')

    # Build vocabulary.
    if self.mode == 'train':
        print('----- Building vocab -----')
        build_vocab(self.sentences, voc_f, min_occur=1)  # TODO

    # Load vocabulary.
    print('----- Loading vocab -----')
    self.vocab = Vocabulary(voc_f)
    print('vocabulary size:', self.vocab.size)
    self.pad = self.vocab.word2id['<pad>']
    self.go = self.vocab.word2id['<go>']
    self.eos = self.vocab.word2id['<eos>']
    self.unk = self.vocab.word2id['<unk>']
def __init__(self, data_dir, seq_length, vocab_size=None, vocab=None, training=False):
    self.data_dir = data_dir
    self.seq_length = seq_length
    self.vocab = Vocabulary()
    with open(os.path.join(data_dir, "simpsons.txt"), "r", encoding="utf-8") as f:
        self.text = f.read()
    if vocab is not None:
        if isinstance(vocab, str):
            self.vocab.load(vocab)
        elif isinstance(vocab, Vocabulary):
            self.vocab = vocab
    elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
        self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
    else:
        self.vocab.add_text(self.text)
        self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
    if vocab_size is not None:
        self.vocab = self.vocab.most_common(vocab_size - 2)
    self.text = self.vocab.clean_text(self.text)
    self.tokens = self.vocab.tokenize(self.text)
def __init__(self, mode, noisy_for_train):
    self.mode = mode
    self.root = os.path.join('../data', 'yelp')
    voc_f = os.path.join('../data/yelp', 'yelp.vocab')
    if self.mode == 'dev':
        self.max_len = 30
    else:
        self.max_len = 20
    self.noisy = self.mode == 'train' and noisy_for_train

    # Load data from domain 0 and domain 1.
    path0 = os.path.join(self.root, 'sentiment.{}.0'.format(mode))
    data0 = []
    self.remove0 = []
    with open(path0) as f:
        for i, line in enumerate(f):
            sent = line.split()
            if 4 < len(sent) < self.max_len:
                data0.append(sent)
            else:
                self.remove0.append(i)
    print('{}/{} removed from domain 0'.format(
        len(self.remove0), len(self.remove0) + len(data0)))

    path1 = os.path.join(self.root, 'sentiment.{}.1'.format(mode))
    data1 = []
    self.remove1 = []
    with open(path1) as f:
        for i, line in enumerate(f):
            sent = line.split()
            if 4 < len(sent) < self.max_len:
                data1.append(sent)
            else:
                self.remove1.append(i)
    print('{}/{} removed from domain 1'.format(
        len(self.remove1), len(self.remove1) + len(data1)))

    self.l0 = len(data0)
    self.l1 = len(data1)

    # Make up for the same length.
    if len(data0) < len(data1):
        data0 = makeup(data0, len(data1))
    if len(data1) < len(data0):
        data1 = makeup(data1, len(data0))
    assert len(data0) == len(data1)
    self.data0 = data0
    self.data1 = data1

    if self.mode == 'dev':
        self.max_len += 5

    # Load vocabulary.
    print('----- Loading vocab -----')
    self.vocab = Vocabulary(voc_f)
    print('vocabulary size:', self.vocab.size)
    self.pad = self.vocab.word2id['<pad>']
    self.go = self.vocab.word2id['<go>']
    self.eos = self.vocab.word2id['<eos>']
    self.unk = self.vocab.word2id['<unk>']
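# Hedged sketch (the real `makeup` helper is not shown in this section): it is assumed
# to oversample the smaller domain by repeating examples until both domains are the
# same size. `makeup_sketch` is a hypothetical stand-in, not the project's definition.
def makeup_sketch(data, n):
    reps = -(-n // len(data))      # ceil(n / len(data))
    return (data * reps)[:n]

assert len(makeup_sketch([['good', 'food']], 3)) == 3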
def __init__(self, data_path, vocab=Vocabulary(), predict=False):
    """Creates an object that gets data from a file."""
    super(Data, self).__init__(data_path, vocab)
    if not predict:
        self._train_test_split()
import pickle


def parse_exist_vocab(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    idx2word = data[2]
    word2idx = data[3]
    vocab = Vocabulary()
    vocab.set_content(word2idx=word2idx, idx2word=idx2word)
    return vocab
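# Hedged usage sketch (illustrative): parse_exist_vocab assumes a legacy pickle laid
# out as a sequence whose items at index 2 and 3 are idx2word and word2idx; the tuple
# below fakes that layout, and 'legacy_vocab.pkl' is a made-up file name.
legacy = (None, None,
          {0: '<pad>', 1: 'hello'},     # idx2word at index 2
          {'<pad>': 0, 'hello': 1})     # word2idx at index 3
with open('legacy_vocab.pkl', 'wb') as f:
    pickle.dump(legacy, f)
vocab = parse_exist_vocab('legacy_vocab.pkl')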
def __init__(self, data_path, vocab=Vocabulary()):
    self.vocab = vocab
    data = get_requests_from_file(data_path)
    print("Downloaded {} samples".format(len(data)))
    # Process each request once and keep both the encoded sample and its length.
    processed = [self._process_request(x) for x in data]
    self.data = [x[0] for x in processed]
    self.lengths = [x[1] for x in processed]
    assert len(self.data) == len(self.lengths)
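# Design note (illustrative, not from the original source): a default like
# `vocab=Vocabulary()` is evaluated once at definition time, so every instance
# constructed without an explicit vocab shares the same Vocabulary object. If that
# sharing is unintended, the usual idiom is a None sentinel; DataSketch below is a
# hypothetical illustration, not the project's class.
class DataSketch:
    def __init__(self, data_path, vocab=None):
        # Build a fresh Vocabulary per instance unless one is supplied.
        self.vocab = vocab if vocab is not None else Vocabulary()
        self.data_path = data_path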
def prepare_training_data_from_init(mconf, num_tasks=7):
    vocab_save_path = mconf.vocab_save_dir_prefix + "vocab.c{}".format(mconf.vocab_cutoff)
    vocab = Vocabulary()
    if os.path.exists(vocab_save_path):
        vocab.init_from_saved_vocab(vocab_save_path)
    else:
        vocab.update_vocab(mconf.data_dir_prefix + "all_text")
        if not os.path.exists(mconf.vocab_save_dir_prefix):
            os.makedirs(mconf.vocab_save_dir_prefix)
        vocab.save_vocab(vocab_save_path)
    mconf.vocab_size = vocab._size
    X_a, y_a, X_b, y_b = get_meta_train_data(mconf, vocab, num_tasks, save=True)
    return X_a, y_a, X_b, y_b, vocab
def train_node2vec(paths, params):
    dump_process_pkl = paths.dump_process
    dump_context_dict = paths.dump_context_dict
    dump_context_list = paths.dump_context_list
    dump_walks = paths.dump_walks
    save_model_path = paths.node2vec_base
    embedding_txt = paths.embedding_text
    embedding_temp = paths.embedding_temp
    embedding = paths.embedding
    mesh_graph_file = paths.MeSH_graph_disease

    if not params.randomize:
        np.random.seed(5)
        torch.manual_seed(5)
        random.seed(5)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()

    # ----------- Random walk --------------------
    directed_graph = False
    if not os.path.exists(dump_walks):
        num_walks = 30
        walk_length = 10
        nx_G = read_graph(mesh_graph_file, directed_graph)
        G = Graph(nx_G, is_directed=directed_graph, p=params.p, q=params.q)
        G.preprocess_transition_probs()
        walks = G.simulate_walks(num_walks, walk_length)
        with open(dump_walks, 'wb') as f:
            pickle.dump(walks, f)
    else:
        with open(dump_walks, 'rb') as f:
            walks = pickle.load(f)

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary(lower=False)
        vocab.add_documents(walks)
        vocab.build()
        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # ---------- build embedding model ----------
    mesh_file = paths.MeSH_file
    ELMO_folder = paths.elmo_folder
    options_file = paths.elmo_options
    weight_file = paths.elmo_weights
    elmo = Elmo(options_file, weight_file, 2, dropout=0)
    elmo.to(device)

    mesh_graph = nx.read_gpickle(mesh_graph_file)
    mesh_graph = mesh_graph.to_undirected()
    mesh_dict = read_mesh_file(mesh_file)

    # Get the list of nodes (idx 0 is '<pad>').
    node_list = list(vocab.vocab.keys())

    # Create the ELMo weight matrix in node_list order (which corresponds to the original vocab index order).
    elmo_embedding_dim = 1024
    if not os.path.exists(os.path.join(ELMO_folder, 'elmo_weights')):
        weight_list = []
        for idx, i in enumerate(node_list):
            if i in mesh_dict:
                node_idx = vocab.token_to_id(i)
                scope_note = mesh_dict[i].scope_note
                character_ids = batch_to_ids(scope_note).to(device)
                elmo_embeddings = elmo(character_ids)
                embeddings = elmo_embeddings['elmo_representations'][0]
                mask = elmo_embeddings['mask']
                embeddings = embeddings * mask.unsqueeze(2).expand(
                    mask.shape[0], mask.shape[1], embeddings.shape[2]).float()
                embeddings = embeddings.mean(dim=0).mean(dim=0)  # average
                weight_list.append(embeddings.cpu())
            else:
                weight_list.append(torch.zeros(elmo_embedding_dim))
        with open(os.path.join(ELMO_folder, 'elmo_weights'), 'wb') as f:
            pickle.dump(weight_list, f)
    else:
        with open(os.path.join(ELMO_folder, 'elmo_weights'), 'rb') as f:
            weight_list = pickle.load(f)

    weight = torch.stack(weight_list, dim=0)

    # ---------- train SkipGram -----------------
    epochs = params.epochs
    batch_size = params.batch_size
    window = params.window
    num_neg_sample = params.num_neg_sample
    writer = SummaryWriter()

    # Use the transformation only once, i.e. either while creating the context dict and list or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(walks, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    # The transform is applied here, so the table stores token indices and we can sample them directly.
    sample_table = negative_sampling_table(vocab.token_counter(), transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data, batch_size=batch_size, shuffle=True,
                                    pin_memory=True, num_workers=6)

    model_embedding = SkipGramModified(len(vocab.vocab), embedding_size=elmo_embedding_dim, weight=weight)
    model_embedding.to(device)
    optimizer_FC = torch.optim.Adam(list(model_embedding.parameters()), lr=0.005)  # + list(model_fc.parameters())

    train(model_embedding, optimizer_FC, context_dataloader, epochs, device, neg_sample,
          n_sample=num_neg_sample, writer=writer, save_path=save_model_path,
          l=l, d=d, vocab=vocab, batch_size=batch_size)

    node_idx = []
    for item in node_list:
        node_idx.append(vocab.token_to_id(item))

    x = torch.tensor(node_idx, device=device)
    y = torch.zeros(x.shape, device=device)
    z = torch.zeros(x.shape, device=device)
    x, y, z = model_embedding(x, y, z)

    word_embeddings = x.cpu().detach().numpy()
    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' + ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)
    wv = KeyedVectors.load_word2vec_format(temp_file)
    wv.save(embedding)
    writer.close()
def train_node2vec(paths, params):
    dump_process_pkl = paths.dump_process
    dump_context_dict = paths.dump_context_dict
    dump_context_list = paths.dump_context_list
    dump_walks = paths.dump_walks
    save_model_path = paths.node2vec_base
    embedding_txt = paths.embedding_text
    embedding_temp = paths.embedding_temp
    embedding = paths.embedding
    mesh_graph_file = paths.MeSH_graph_disease

    if not params.randomize:
        np.random.seed(5)
        torch.manual_seed(5)
        random.seed(5)

    # ----------- Random walk --------------------
    directed_graph = False
    if not os.path.exists(dump_walks):
        num_walks = 30
        walk_length = 8
        nx_G = read_graph(mesh_graph_file, directed_graph)
        G = Graph(nx_G, is_directed=directed_graph, p=params.p, q=params.q)
        G.preprocess_transition_probs()
        walks = G.simulate_walks(num_walks, walk_length)
        with open(dump_walks, 'wb') as f:
            pickle.dump(walks, f)
    else:
        with open(dump_walks, 'rb') as f:
            walks = pickle.load(f)

    # ---------- train SkipGram -----------------
    epochs = params.epochs
    batch_size = params.batch_size
    window = params.window
    num_neg_sample = params.num_neg_sample
    writer = SummaryWriter()

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary(lower=False)
        vocab.add_documents(walks)
        vocab.build()
        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # Use the transformation only once, i.e. either while creating the context dict and list or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(walks, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # The transform is applied here, so the table stores token indices and we can sample them directly.
    sample_table = negative_sampling_table(vocab.token_counter(), transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data, batch_size=batch_size, shuffle=True, num_workers=6)

    model_embedding = SkipGram(len(vocab.vocab), embedding_size=1024)
    model_embedding.to(device)
    optimizer_embedding = torch.optim.SparseAdam(model_embedding.parameters(), lr=0.005)

    train(model_embedding, optimizer_embedding, context_dataloader, epochs, device, neg_sample,
          n_sample=num_neg_sample, transform=None, writer=writer, save_path=save_model_path,
          l=l, d=d, vocab=vocab, batch_size=batch_size)

    word_embeddings = (model_embedding.out_embedding.weight.data + model_embedding.in_embedding.weight.data) / 2
    word_embeddings = word_embeddings.cpu().numpy()
    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' + ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)
    wv = KeyedVectors.load_word2vec_format(temp_file)
    wv.save(embedding)
    writer.close()


# if __name__ == '__main__':
#     base_path = '/media/druv022/Data2/Final'
#     paths = Paths(base_path, node2vec_type='1')
#     train_node2vec(paths)
def main():
    # Update paths
    training_data = r'----------------/Data/Skipgram/hansards/training.en'
    dump_process_pkl = r'----------------/Data/Skipgram/hansards/processed_en_w.pkl'
    dump_context_dict = r'----------------/Data/Skipgram/hansards/context_dict_w.pkl'
    dump_context_list = r'----------------/Data/Skipgram/hansards/context_list_w.pkl'
    save_model_path = r'----------------/Data/Skipgram/hansards'
    embedding_txt = r'----------------/Data/Skipgram/hansards/embedding.txt'
    embedding_temp = r'----------------/Data/Skipgram/hansards/embedding_temp.txt'

    epochs = 20
    batch_size = 2**10
    window = 5
    num_neg_sample = 5
    writer = SummaryWriter()

    # Renamed from `stopwords` to avoid shadowing the imported nltk.corpus.stopwords module.
    stop_words = set(stopwords.words('english'))

    with open(training_data, 'r') as f:
        data = f.readlines()
    data = [line.replace('\n', '').split(' ') for line in data]
    data = [[word for word in line if word not in stop_words] for line in data]

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary()
        vocab.add_documents(data)
        vocab.build()
        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # Use the transformation only once, i.e. either while creating the context dict and list or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(data, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # The transform is applied here, so the table stores token indices and we can sample them directly.
    sample_table = negative_sampling_table(vocab.token_counter(), transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data, batch_size=batch_size, shuffle=True, num_workers=6)

    model_embedding = SkipGram(len(vocab.vocab), embedding_size=200)
    model_embedding.load_state_dict(torch.load(os.path.join(save_model_path, 'sk_model5_5.pkl')))
    model_embedding.to(device)
    optimizer_embedding = torch.optim.SparseAdam(model_embedding.parameters(), lr=0.005)

    train(model_embedding, optimizer_embedding, context_dataloader, epochs, device, neg_sample,
          n_sample=num_neg_sample, save_path=save_model_path)

    word_embeddings = (model_embedding.out_embedding.weight.data + model_embedding.in_embedding.weight.data) / 2
    word_embeddings = word_embeddings.cpu().numpy()
    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' + ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)
    wv = KeyedVectors.load_word2vec_format(temp_file)
    result = wv.most_similar(positive=['woman', 'king'], negative=['man'])
    print("{}: {:.4f}".format(*result[0]))
    writer.close()
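# Hedged sketch (illustrative, not the project's own code): `negative_sampling_table`
# is assumed to build a word2vec-style unigram table in which each token id appears in
# proportion to count ** 0.75, so negatives can be drawn with np.random.choice.
import numpy as np

def negative_sampling_table_sketch(token_counter, transform, table_size=1_000_000):
    tokens, counts = zip(*token_counter.items())
    probs = np.asarray(counts, dtype=np.float64) ** 0.75
    probs /= probs.sum()
    ids = np.array([transform(t) for t in tokens])
    # Draw table entries with replacement from the smoothed unigram distribution.
    return np.random.choice(ids, size=table_size, p=probs)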
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config)

    seeds = [int(x) for x in config._args.seeds.split(',')]

    for seed in seeds:
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if config['experts']['text_feat'] == 'learnable':
            # vocab
            vocab = Vocabulary()
            vocab.load('dataset/captions/dict.all_200k_gan.json')
            vocab_size = len(vocab)

            if config['experts']['text_feat_init'] == True:
                # word2vec, download file and move to we_root-path directory
                # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
                we_rootpath = '/home/yj/pretrained_model'
                w2v_data_path = os.path.join(we_rootpath, "word2vec/", 'flickr', 'vec500flickr30m')
                we_parameter = get_we_parameter(vocab, w2v_data_path)
            else:
                we_parameter = None
        else:
            vocab = None
            vocab_size = None
            we_parameter = None

        if "attr" in config['experts']['modalities']:
            attr_vocab = Vocabulary()
            attr_vocab.load('dataset/captions/dict.attr.json')
            attr_vocab_size = len(attr_vocab)
        else:
            attr_vocab = None
            attr_vocab_size = None

        data_loaders = config.init(name='data_loader',
                                   module=module_data,
                                   raw_input_dims=raw_input_dims,
                                   text_feat=config['experts']['text_feat'],
                                   text_dim=config['experts']['text_dim'],
                                   vocab=vocab,
                                   attr_vocab=attr_vocab,
                                   pretrain=config['trainer']['pretrain'])

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config['experts']['text_dim'],
            same_dim=config['experts']['ce_shared_dim'],
            we_parameter=we_parameter,
            vocab_size=vocab_size,
            attr_vocab_size=attr_vocab_size,
            text_feat=config['experts']['text_feat'],
        )
        # logger.info(model)

        loss = config.init(name='loss', module=module_loss)
        trainable_params = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler, optimizer)

        trainer = Trainer(
            model,
            loss,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
        )
        trainer.train()

        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        test_args = argparse.ArgumentParser()
        test_args.add_argument("--device", default=config._args.device)
        test_args.add_argument("--resume", default=best_ckpt_path)
        test_config = ConfigParser(test_args)
        test(test_config)
def test(config):
    config.config['data_loader']['args']['mode'] = 'test'
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims, raw_input_dims = compute_dims(config)

    if config['experts']['text_feat'] == 'learnable':
        # vocab
        vocab = Vocabulary()
        vocab.load('dataset/captions/dict.all_200k_gan.json')
        vocab_size = len(vocab)

        # word2vec
        if config['experts']['text_feat_init'] == True:
            # word2vec, download file and move to we_root-path directory
            # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
            we_rootpath = '/home/yj/pretrained_model'
            w2v_data_path = os.path.join(we_rootpath, "word2vec/", 'flickr', 'vec500flickr30m')
            we_parameter = get_we_parameter(vocab, w2v_data_path)
        else:
            we_parameter = None
    else:
        vocab = None
        vocab_size = None
        we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load('dataset/captions/dict.attr.json')
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(name='data_loader',
                               module=module_data,
                               raw_input_dims=raw_input_dims,
                               text_feat=config['experts']['text_feat'],
                               text_dim=config['experts']['text_dim'],
                               vocab=vocab,
                               attr_vocab=attr_vocab,
                               pretrain=config['trainer']['pretrain'])

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        we_parameter=we_parameter,
                        vocab_size=vocab_size,
                        attr_vocab_size=attr_vocab_size,
                        text_feat=config['experts']['text_feat'])

    ckpt_path = Path(config._args.resume)
    logger.info(f"Loading checkpoint: {ckpt_path} ...")
    checkpoint = torch.load(ckpt_path)
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")
    model = model.to(device)
    model.eval()

    categories = ['dress', 'shirt', 'toptee']
    modalities = data_loaders[categories[0]].dataset.ordered_experts
    metric = {'score': dict()}

    for i, category in enumerate(categories):
        val_experts = {expert: list() for expert in modalities}
        target_ind = {expert: list() for expert in modalities}
        data_asin = []

        for batch in data_loaders[category + '_trg']:
            for key, val in batch['candidate_experts'].items():
                batch['candidate_experts'][key] = val.to(device)
            data_asin.extend([meta['candidate'] for meta in batch['meta_info']])
            for key, val in batch['candidate_ind'].items():
                target_ind[key].append(val)

            with torch.no_grad():
                experts, _, _ = model(batch['candidate_experts'], batch['candidate_ind'], target=True)
                for modality, val in experts.items():
                    val_experts[modality].append(val)

        for modality, val in val_experts.items():
            val_experts[modality] = torch.cat(val)
        for modality, val in target_ind.items():
            target_ind[modality] = torch.cat(val)

        scores = []
        meta_infos = []
        val_size = val_experts['resnet'].size(0)

        for batch in data_loaders[category]:
            for experts in ['candidate_experts']:
                for key, val in batch[experts].items():
                    batch[experts][key] = val.to(device)
            batch["text"] = batch["text"].to(device)
            batch_size = batch["text"].size(0)
            meta_infos.extend(list(batch['meta_info']))

            with torch.no_grad():
                # composition_feature, text, moe_weights = model(batch['candidate_experts'],
                #                                                batch['candidate_ind'],
                #                                                batch['text'],
                #                                                batch['text_bow'],
                #                                                batch['text_lengths'])
                # batch_target = dict()
                # for mod in modalities:
                #     tmp = []
                #     for k in range(batch_size):
                #         tmp.append(model.target_composition(val_experts[mod],
                #                                             text[mod][k].expand(val_size, -1)))
                #     batch_target[mod] = torch.stack(tmp)

                src_experts = model.image_encoder(batch['candidate_experts'], batch['candidate_ind'])
                src_text, moe_weights = model.get_text_feature(batch['text'], batch['candidate_ind'],
                                                               batch['text_bow'], batch['text_lengths'])
                src_feature = model.get_combined_feature(src_experts, src_text)

                trg_text, _ = model.get_text_feature(batch['text'], batch['target_ind'],
                                                     batch['text_bow'], batch['text_lengths'],
                                                     target=True)
                # trg_text, _ = self.model.text_encoder['trg'](batch['text_mean'].unsqueeze(1), batch['target_ind'])

                batch_target = dict()
                for h, mod in enumerate(modalities):
                    tmp = []
                    for k in range(batch_size):
                        tmp.append(
                            model.trg_normalization_layer(
                                model.target_composition[h](val_experts[mod],
                                                            trg_text[mod][k].expand(val_size, -1))))
                    batch_target[mod] = torch.stack(tmp)

                cross_view_conf_matrix = sharded_cross_view_inner_product(
                    vid_embds=batch_target,
                    text_embds=src_feature,
                    text_weights=moe_weights,
                    subspaces=model.image_encoder.modalities,
                    l2renorm=True,
                    dist=True,
                    val=True)
                scores.append(cross_view_conf_matrix)

        scores = torch.cat(scores)
        val_ids = data_loaders[category + '_trg'].dataset.data
        assert val_ids == data_asin
        metric['score'][category] = {
            'ids': val_ids,
            'matrix': scores,
            'meta_info': meta_infos
        }

    save_fname = ckpt_path.parent / f'test_score.pt'
    tic = time.time()
    logger.info("Saving score matrix: {} ...".format(save_fname))
    torch.save(metric, save_fname)
    logger.info(f"Done in {time.time() - tic:.3f}s")
params = {
    "batch_size": 128,
    "embed_size": 64,
    "hidden_size": 64,
    "num_layers": 2,
    "checkpoints": "./checkpoints/",
    "std_factor": 6.,
    "dropout": 0.7,
}
path_normal_data = "datasets/vulnbank_train.txt"
path_anomaly_data = "datasets/vulnbank_anomaly.txt"

create_checkpoints_dir(params["checkpoints"])

vocab = Vocabulary()
params["vocab"] = vocab

# d = Data(path_normal_data)

#####
x = np.linspace(0, 30, 105)
y = 2 * np.sin(x)

l1, = plt.plot(x[:85], y[:85], 'y', label='training samples')
l2, = plt.plot(x[85:], y[85:105], 'c--', label='test samples')
plt.legend(handles=[l1, l2], loc='upper left')
plt.show()

train_y = y.copy()
noise_factor = 0.5
def test(config):
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims = compute_dims(config)

    vocab = None
    vocab_size = None
    we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load(os.path.join(config['data_loader']['args']['data_dir'],
                                     'attributes/dict.attr.json'))
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(
        name='data_loader',
        module=module_data,
        expert_dims=expert_dims,
        text_feat=config['experts']['text_feat'],
        text_dim=config['experts']['text_dim'],
    )

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        text_feat=config['experts']['text_feat'])

    trainer = TrainerJoint(
        model,
        loss=None,
        optimizer=None,
        config=config,
        data_loaders=data_loaders,
        lr_scheduler=None,
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    metric = trainer._valid_epoch(save_textatt=True)
    if config._args.mode == 'val':
        for key, value in metric.items():
            if key == 'recall_avg':
                logger.info(f'[Avg Recall] : {value}')
            elif key == 'recall_avg_corr':
                logger.info(f'[Avg Recall corr]: {value}')
            elif key == 'comb_avg':
                logger.info(f'[comb_avg] : {value}')
            elif key == 'recall':
                for i, category in zip(value, trainer.categories):
                    if len(i) == 2:
                        logger.info(f'[{category}] r@10, r@50: {i[0]}\t{i[1]}')
                    elif len(i) == 4:
                        logger.info(f'[{category}] comp corr r@10, r@50: {i[0]}\t{i[1]}\t{i[2]}\t{i[3]}')
            elif key == 'comb':
                combstr = "comb:"
                for i, category in zip(value, trainer.categories):
                    combstr += f' {i[0]} {i[1]}'
                logger.info(combstr)
    else:
        save_fname = config.save_dir / f'test_score.pt'
        tic = time.time()
        logger.info("Saving score matrix: {} ...".format(save_fname))
        torch.save(metric, save_fname)
        logger.info(f"Done in {time.time() - tic:.3f}s")
def main(paths, params):
    path_to_train_input = paths.training
    path_to_valid_input = paths.develop
    path_to_test = paths.test
    ctd_file = paths.ctd_file
    c2m_file = paths.c2m_file
    toD_mesh = Convert2D(ctd_file, c2m_file)

    sentence_pad = False  # Don't pad sentences with the begin/end markers '<s>' and '</s>'.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()

    X = BratInput(path_to_train_input)
    X = X.transform()
    X = split_annotated_documents(X)

    X_valid = BratInput(path_to_valid_input)
    X_valid = X_valid.transform()
    X_valid = split_annotated_documents(X_valid)

    X_test = BratInput(path_to_test)
    X_test = X_test.transform()
    X_test = split_annotated_documents(X_test)

    if params.randomize:
        torch.manual_seed(5)
        random.seed(5)
        np.random.seed(5)

    # Obtain MeSH information.
    mesh_file = paths.MeSH_file
    disease_file = paths.disease_file
    mesh_graph_file = paths.MeSH_graph_disease
    mesh_folder = paths.MeSH_folder
    mt_folder = paths.multitask_folder

    # Read the disease file.
    with open(disease_file, 'r') as f:
        disease_data = f.readlines()

    mesh_dict = read_mesh_file(mesh_file)

    mesh_graph = nx.read_gpickle(mesh_graph_file)
    mesh_graph = mesh_graph.to_undirected()

    scope_text, id2idx_dict, idx2id_dict = mesh_dict_to_tokens(mesh_dict, disease_data)
    node_list = list(idx2id_dict.values())

    # A_HAT matrix for GCN.
    if not os.path.exists(os.path.join(mesh_folder, 'a_hat_matrix')):
        a_matrix = get_adjacancy_matrix(mesh_graph, node_list)
        a_matrix = sparse.coo_matrix(a_matrix)
        with open(os.path.join(mesh_folder, 'a_hat_matrix'), 'wb') as f:
            pickle.dump(a_matrix, f)  # fixed: the original dumped the undefined name `data`
    else:
        with open(os.path.join(mesh_folder, 'a_hat_matrix'), 'rb') as f:
            a_matrix = pickle.load(f)

    i = torch.tensor([a_matrix.row, a_matrix.col], dtype=torch.long, device=device)
    v = torch.tensor(a_matrix.data, dtype=torch.float32, device=device)
    a_hat = torch.sparse.FloatTensor(i, v, torch.Size([len(node_list), len(node_list)])).to(device)

    # Construct usable data format.
    x_tr_text, ner_tr_tags, x_tr_tokens = annotated_docs_to_tokens(X, sentence_pad=sentence_pad)
    x_val_text, ner_val_tags, x_val_tokens = annotated_docs_to_tokens(X_valid, sentence_pad=sentence_pad)
    x_test_text, ner_test_tags, x_test_tokens = annotated_docs_to_tokens(X_test, sentence_pad=sentence_pad)

    # ELMo embeddings.
    options_file = paths.elmo_options
    weight_file = paths.elmo_weights
    ELMO_folder = paths.elmo_folder
    elmo_dim = params.elmo_dim
    elmo = Elmo(options_file, weight_file, 2, dropout=0)
    elmo.to(device)

    with torch.no_grad():
        if not os.path.exists(os.path.join(mt_folder, 'text_tr_elmo_split.pkl')):
            text_tr = get_elmo_representation(x_tr_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(mt_folder, 'text_tr_elmo_split.pkl'), 'wb+') as f:
                pickle.dump(text_tr, f)
        else:
            with open(os.path.join(mt_folder, 'text_tr_elmo_split.pkl'), 'rb+') as f:
                text_tr = pickle.load(f)

        if not os.path.exists(os.path.join(mt_folder, 'text_val_elmo_split.pkl')):
            text_val = get_elmo_representation(x_val_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(mt_folder, 'text_val_elmo_split.pkl'), 'wb+') as f:
                pickle.dump(text_val, f)
        else:
            with open(os.path.join(mt_folder, 'text_val_elmo_split.pkl'), 'rb+') as f:
                text_val = pickle.load(f)

        if not os.path.exists(os.path.join(paths.multitask_folder, 'text_test_elmo_split.pkl')):
            text_test = get_elmo_representation(x_test_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(paths.multitask_folder, 'text_test_elmo_split.pkl'), 'wb+') as f:
                pickle.dump(text_test, f)
        else:
            with open(os.path.join(paths.multitask_folder, 'text_test_elmo_split.pkl'), 'rb+') as f:
                text_test = pickle.load(f)

    # NER label vocab.
    ner_labels_vocab = Vocabulary(lower=False)
    ner_labels_vocab.add_documents(ner_tr_tags)
    ner_labels_vocab.build()

    # MeSH scope embedding.
    if not os.path.exists(os.path.join(paths.dump_folder, 'scope_emb.pkl')):
        scope_embedding, _ = get_scope_elmo(elmo, ELMO_folder, scope_text, elmo_dim,
                                            idx2id_dict, id2idx_dict, device=device)
        with open(os.path.join(paths.dump_folder, 'scope_emb.pkl'), 'wb') as f:
            pickle.dump(scope_embedding, f)
    else:
        with open(os.path.join(paths.dump_folder, 'scope_emb.pkl'), 'rb') as f:
            scope_embedding = pickle.load(f)

    train_el_set = EL_set(X, toD_mesh, id2idx_dict)
    val_el_set = EL_set(X_valid, toD_mesh, id2idx_dict)

    train(paths, params, X, text_tr, ner_tr_tags, train_el_set,
          X_valid, x_val_tokens, text_val, ner_val_tags, val_el_set,
          ner_labels_vocab, scope_text, scope_embedding, a_hat, mesh_graph,
          id2idx_dict, idx2id_dict, writer, device=device)