def json2vocab(filenames, vocab_filename, vocab_size, valid_users=None,
               valid_subreddits=None, overwrite=False):
    """Reads all the .json files and keeps the top words mentioned in them by the
    valid users and subreddits.

    Args:
        filenames: List of paths where the .json files are.
        vocab_filename: String with the path of the vocabulary.
        vocab_size: Total number of words to be used, i.e. the top-k limit.
        valid_users: Set of users whose words should be kept.
        valid_subreddits: Set of subreddits whose words should be kept.
        overwrite: Whether to overwrite an existing file.
    Returns:
        A set of words.
    Saves:
        A set of words and a text file with (word, count) pairs for sanity checking.
    """
    counter_filename = vocab_filename.replace('pkl', 'txt')
    if os.path.exists(vocab_filename) and not overwrite:
        return load_pickle(vocab_filename, False)
    print 'Making:\n%s\n%s' % (vocab_filename, counter_filename)
    counters = []
    limit = vocab_size
    pool = mp.Pool(n_proc)
    proc_data_size = int(np.ceil(1. * len(filenames) / n_proc))
    for i in range(n_proc):
        proc_filenames = filenames[i * proc_data_size:(i + 1) * proc_data_size]
        if len(proc_filenames) > 0:
            pool.apply_async(_json2vocab_mp,
                             args=(i, proc_filenames, valid_subreddits, valid_users),
                             callback=counters.append)
    pool.close()
    pool.join()
    combined_counters = combine_dicts(counters)
    print 'Total words before pruning were %d' % len(combined_counters)
    sorted_vocab = sorted(combined_counters.items(), key=lambda x: x[1], reverse=True)
    vocab = set([x[0] for x in sorted_vocab[:limit - 3]])
    # Explicitly add unk and sentence start/end tokens.
    vocab.add('<unk>')
    vocab.add('<sent_end>')
    vocab.add('<sent_start>')
    vocab = set_to_dict(vocab, 1)  # word ids start from 1!!
    save_pickle(vocab_filename, vocab)
    final_counter = np.array(sorted_vocab[:limit])
    save_txt(counter_filename, final_counter, delimiter=' ', fmt='%s')
    return vocab
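# --- Usage sketch (editor's addition, not part of the original pipeline). ---
# A minimal, hypothetical example of how json2vocab is meant to be called; the
# paths, the 50000 limit, and the valid_* sets below are placeholders.
def _example_json2vocab_usage(valid_users, valid_subreddits):
    import glob
    monthly_dumps = sorted(glob.glob('/data/reddit/json/RC_*.json'))  # hypothetical location
    vocab = json2vocab(monthly_dumps,
                       vocab_filename='/data/reddit/vocab_50k.pkl',  # hypothetical path
                       vocab_size=50000,
                       valid_users=valid_users,
                       valid_subreddits=valid_subreddits)
    # vocab maps word -> id, ids start from 1, and <unk>, <sent_start>,
    # <sent_end> are always included.
    return vocab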
def get_word_embeddings(dataset='9sr'):
    """Returns 300d embedding matrix based on 50k vocab from first_all."""
    filename = os.path.join(word_embeddings_dir, 'query_completion',
                            '%s_glove_matrix_300d.pkl' % dataset)
    if os.path.exists(filename):
        return load_pickle(filename, False)
    else:
        print 'I do not have word embeddings for %s. Making it now... Takes a few minutes.' % dataset
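# --- Sketch (editor's addition): assembling a GloVe matrix for a vocabulary. ---
# The "making it now" branch above is truncated in this snippet; a typical
# construction, given a word -> id vocabulary (ids starting at 1) and a GloVe
# .txt file, looks roughly like the following. The glove_txt_path argument and
# the row-0-is-padding convention are assumptions, not confirmed by the
# original code.
def _example_build_glove_matrix(vocab, glove_txt_path, dim=300):
    matrix = np.zeros((len(vocab) + 1, dim))  # row 0 reserved (ids start at 1)
    found = set()
    with open(glove_txt_path) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in vocab:
                matrix[vocab[word]] = np.array(parts[1:], dtype=np.float32)
                found.add(word)
    # Words with no pretrained vector (e.g. <unk>, <sent_start>) get a small
    # random initialization.
    for word, idx in vocab.items():
        if word not in found:
            matrix[idx] = 0.1 * np.random.randn(dim)
    return matrix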
def create_valid_subreddit_set(subscribers_dict, subscriber_limit=1000, overwrite=False):
    """Makes a set of subreddits with at least subscriber_limit subscribers,
    based on an input dictionary."""
    subreddit_set_filename = get_valid_sub_name(subscriber_limit)
    if os.path.exists(subreddit_set_filename) and not overwrite:
        return load_pickle(subreddit_set_filename, False)
    sub_set = set()  # get it?
    for (subreddit, subscriber_count) in subscribers_dict.iteritems():
        if subscriber_count >= subscriber_limit:
            sub_set.add(subreddit)
    print '-->Sub set has %d subreddits' % len(sub_set)
    save_pickle(subreddit_set_filename, sub_set)
    return sub_set
def create_valid_user_set(params, years=None, overwrite=False):
    """Creates a set of valid users, meaning users with more than min_posts posts."""
    filename = get_valid_user_filename(params, years)
    if os.path.exists(filename) and not overwrite:
        return load_pickle(filename, False)
    user_counts = combine_dicts(get_user_count_dictionaries(params, years))
    usernames = get_top_users(params.min_posts, user_counts)
    usernames = remove_bots(usernames, params)
    print '--> Total valid users: %d' % len(usernames)
    save_pickle(filename, usernames)
    return usernames
def _dict2matrix_mp(proc_id, counts_filenames, valid_subreddits, valid_users, to_remove=None):
    """Converts a list of count dictionaries into a COO array.

    Each dictionary key is a 'user category' string. The map splits it into a
    (user, category, count) tuple, and the filter keeps only tuples whose user
    is in valid_users and whose category is in valid_subreddits. Users and
    categories are strings here and are turned into ids afterwards.

    Args:
        proc_id: Id of the process.
        counts_filenames: Filenames of the dictionaries to be loaded.
        valid_subreddits: Set of subreddits to be considered.
        valid_users: Set of users to be considered.
        to_remove: Set of users to be removed if this is a test set.
    """
    categories = set_to_dict(valid_subreddits)
    users = set_to_dict(valid_users)
    R, C = len(users), len(categories)
    result_data = np.zeros((0, 3))
    for filename in counts_filenames:
        print proc_id, filename
        counts = load_pickle(filename, False).items()
        sys.stdout.flush()
        points = filter(
            lambda x: x[0] in valid_users and x[1] in valid_subreddits,
            map(lambda x: (x[0].split(' ')[0], x[0].split(' ')[1], int(x[1])), counts))
        data = np.zeros((len(points), 3))
        for i, p in enumerate(points):
            data[i] = [users[p[0]], categories[p[1]], p[2]]
        # save the partial array for downweighting later
        if to_remove is not None:
            for u in to_remove:
                mask = np.where(data[:, 0] == u)[0]
                data[mask, 2] = 0
        data = np.vstack((data, np.array([R - 1, C - 1, 0])))
        save_filename = filename.replace('uc_dict.pkl', 'UxS_%d.npy' % len(valid_users))
        save_array(save_filename, data, False)
        print proc_id, len(data)
        result_data = np.vstack((result_data, data))
        sys.stdout.flush()
    return result_data
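# --- Sketch (editor's addition): turning the (row, col, value) triplets into a
# sparse matrix. The repo's own data_to_sparse helper (used in lm_valid_users
# below) presumably does something along these lines; this is an illustrative
# reimplementation, not the original.
import scipy.sparse as sparse

def _example_data_to_sparse(coo_data, shape=None):
    rows = coo_data[:, 0].astype(int)
    cols = coo_data[:, 1].astype(int)
    vals = coo_data[:, 2]
    if shape is None:
        shape = (rows.max() + 1, cols.max() + 1)
    # Sum duplicate (row, col) entries and return CSR for fast row slicing.
    return sparse.coo_matrix((vals, (rows, cols)), shape=shape).tocsr()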
def get_most_popular(min_subscribers, subreddit_limit=50000, overwrite=False):
    """Reads, or crawls (if it does not exist), the subreddit -> subscribers dictionary.

    If overwrite, it crawls the data from the internet again. Else, it reads the
    existing file.
    """
    subscribers_dict = {}
    subscribers_dict_filename = get_sub_dict_name(subreddit_limit)
    if os.path.exists(subscribers_dict_filename) and not overwrite:
        subscribers_dict = load_pickle(subscribers_dict_filename, False)
        return create_valid_subreddit_set(subscribers_dict, min_subscribers,
                                          overwrite)  # fast - run anyway
    subscribers_dict = crawl_subreddit_subscribers(subreddit_limit, subscribers_dict,
                                                   subscribers_dict_filename)
    print '--> Subscriber Dict has %d entries' % len(subscribers_dict)
    return create_valid_subreddit_set(subscribers_dict, min_subscribers,
                                      overwrite)  # fast - run anyway
def lm_valid_users(filename, params, uxs=None, user_names=None, years=None, overwrite=False):
    """Creates the valid users for language modeling, after applying all previous
    filters plus the min h-index filter.

    Args:
        filename: Language model valid users filename to be saved.
        params: Parameters of the preprocessing run.
        uxs: User by Subreddit count matrix (sparse).
        user_names: Dictionary from user id to user_name.
        years: List of all the years we want to take into consideration.
            If None, it selects all available.
        overwrite: Boolean to define whether to overwrite an existing file.
    Returns:
        A set of user names, all of whom had an h-index larger than that specified in params.
    """
    if os.path.exists(filename) and not overwrite:
        return load_pickle(filename, False)
    if user_names is None:
        user_set = create_valid_user_set(params, years, overwrite)
        user_names = invert_dict(set_to_dict(user_set))
    if uxs is None:
        uxs = data_to_sparse(
            dict2matrix(params, valid_users=user_set, years=years))
    assert len(user_names) == uxs.shape[0]
    print 'Calculating h-indices...'
    user_h_index = []
    for u in range(uxs.shape[0]):
        counts = sorted(uxs.getrow(u).data, reverse=True)
        user_h_index.append(get_h_index(counts))
    user_h_index = np.array(user_h_index)
    top_users = np.where(user_h_index >= params.h_index_min)[0]
    top_usernames = set([user_names[u] for u in top_users])
    print 'Total Users: %d -> after pruning with at least %d h-index, %d users left' % (
        len(user_names), params.h_index_min, len(top_usernames))
    save_pickle(filename, top_usernames)
    return top_usernames
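# --- Sketch (editor's addition): the h-index computation assumed by the loop
# above. get_h_index is defined elsewhere in the repo; this illustrates the
# standard definition on a descending list of per-subreddit post counts: the
# largest h such that the user has at least h subreddits with >= h posts each.
def _example_h_index(sorted_counts_desc):
    h = 0
    for i, c in enumerate(sorted_counts_desc, start=1):
        if c >= i:
            h = i
        else:
            break
    return h
# e.g. counts [10, 4, 3, 1] -> h-index 3 (three subreddits with at least 3 posts each).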
def train_mixture_model(train, val, test, method='logP', recall_k=100,
                        dataset_name='d_name', overwrite=False, num_proc=None):
    """Runs the main experiment of the paper: finding the best mixing weights per user
    for the two components.

    Learns the weights per user and saves them in a file. If the file exists it just
    loads it. It then evaluates on the test set. There is a memory component, which
    captures where a person has been in the past (exploit), and a global component,
    which captures the population preferences (explore).

    Data come in COO form, i.e. a numpy array of (N x 3) where each row is the
    (row, column, value) triplet of the sparse Users x Categories array. N is the
    number of entries in the array.

    :param train: train data COO matrix
    :param val: validation data COO matrix
    :param test: test data COO matrix
    :param method: Method of evaluation. Can be 'logP' or 'recall', for log
        probability per event or recall@k respectively.
    :param recall_k: the k for recall@k. If method is 'logP' this does nothing.
    :param dataset_name: Name of the directory where the results will be saved.
    :param overwrite: Boolean, whether to overwrite learned weights or read them if they exist.
    :param num_proc: Number of processes to be used. If None, all the processors in
        the machine will be used.
    :return: an array of mixing weights, which is n_users x 2 (2 components, self and global)
    """
    filename = os.path.join(results_dir, 'mixture_model', dataset_name, 'mixing_weights.pkl')
    if os.path.exists(filename) and not overwrite:
        mix_weights = load_pickle(filename, False)
    else:
        train_matrix, global_matrix = get_train_global(train, val, test)
        components = [train_matrix, global_matrix]  # can add more components here
        mix_weights = learn_mixing_weights(components, val, num_proc=num_proc)
        save_pickle(filename, mix_weights, False)
    evaluate_method(train, val, test, mix_weights, method, recall_k)
    return mix_weights
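# --- Sketch (editor's addition): what the two-component mixture evaluates.
# For user u and category c, the mixed probability is
#     P(c | u) = w[u, 0] * P_self(c | u) + w[u, 1] * P_global(c),
# and 'logP' scores the average log probability per held-out event. This is an
# illustrative (dense, unsmoothed) version of the idea; the repo's
# learn_mixing_weights / evaluate_method handle the sparse case.
def _example_logp_per_event(test_coo, train_matrix, global_probs, mix_weights):
    # train_matrix: dense n_users x n_cats counts; global_probs: length n_cats.
    row_sums = train_matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    p_self = train_matrix / row_sums
    total_logp, total_events = 0.0, 0.0
    for u, c, count in test_coo:
        u, c = int(u), int(c)
        p = mix_weights[u, 0] * p_self[u, c] + mix_weights[u, 1] * global_probs[c]
        total_logp += count * np.log(p + 1e-12)
        total_events += count
    return total_logp / total_events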
def main(config, progress):
    # save config
    with open("./log/configs.json", "a") as f:
        json.dump(config, f)
        f.write("\n")
    cprint("*"*80)
    cprint("Experiment progress: {0:.2f}%".format(progress*100))
    cprint("*"*80)
    metrics = {}

    # data hyper-params
    data_path = config["data_path"]
    keyword_path = config["keyword_path"]
    pretrained_wordvec_path = config["pretrained_wordvec_path"]
    data_dir = "/".join(data_path.split("/")[:-1])
    dataset = data_path.split("/")[-2]  # convai2 or casual
    test_mode = bool(config["test_mode"])
    save_model_path = config["save_model_path"]
    min_context_len = config["min_context_len"]
    max_context_len = config["max_context_len"]
    max_sent_len = config["max_sent_len"]
    max_keyword_len = config["max_keyword_len"]
    max_vocab_size = config["max_vocab_size"]
    max_keyword_vocab_size = config["max_keyword_vocab_size"]
    remove_self_loop = bool(config["remove_self_loop"])

    # model hyper-params
    config_id = config["config_id"]
    model = config["model"]
    gnn = config["gnn"]
    aggregation = config["aggregation"]
    utterance_encoder = config["utterance_encoder"]
    use_last_k_utterances = config["use_last_k_utterances"]
    use_CN_hopk_graph = config["use_CN_hopk_graph"]
    use_utterance_concepts = bool(config["use_utterance_concepts"])
    combine_node_emb = config["combine_node_emb"]  # replace, mean, max, concat
    concept_encoder = config["concept_encoder"]
    embed_size = config["embed_size"]
    use_pretrained_word_embedding = bool(config["use_pretrained_word_embedding"])
    fix_word_embedding = bool(config["fix_word_embedding"])
    hidden_size = config["hidden_size"]
    n_layers = config["n_layers"]
    bidirectional = bool(config["bidirectional"])
    n_heads = config["n_heads"]
    dropout = config["dropout"]

    # training hyper-params
    batch_size = config["batch_size"]
    epochs = config["epochs"]
    lr = config["lr"]
    lr_decay = config["lr_decay"]
    seed = config["seed"]
    device = torch.device(config["device"])
    fp16 = bool(config["fp16"])
    fp16_opt_level = config["fp16_opt_level"]

    # set seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if "convai2" in data_dir and min_context_len != 2:
        raise ValueError("convai2 dataset has min context len of 2")
    if use_pretrained_word_embedding and str(embed_size) not in pretrained_wordvec_path:
        raise ValueError("embedding size and pretrained_wordvec_path not match")

    # load data
    cprint("Loading conversation data...")
    train, valid, test = load_pickle(data_path)
    train_keyword, valid_keyword, test_keyword = load_pickle(keyword_path)

    if test_mode:
        cprint("Testing model...")
        train = train + valid
        train_keyword = train_keyword + valid_keyword
        valid = test
        valid_keyword = test_keyword

    cprint(len(train), len(train_keyword), len(valid), len(valid_keyword))
    cprint("sample train: ", train[0])
    cprint("sample train keyword: ", train_keyword[0])
    cprint("sample valid: ", valid[0])
    cprint("sample valid keyword: ", valid_keyword[0])

    # clip and pad data
    train_padded_convs, train_padded_keywords = pad_and_clip_data(
        train, train_keyword, min_context_len, max_context_len+1, max_sent_len, max_keyword_len)
    valid_padded_convs, valid_padded_keywords = pad_and_clip_data(
        valid, valid_keyword, min_context_len, max_context_len+1, max_sent_len, max_keyword_len)
    cprint(len(train_padded_convs), len(train_padded_keywords), len(valid_padded_convs), len(valid_padded_keywords))
    cprint("sample padded train: ", train_padded_convs[0])
    cprint("sample padded train keyword: ", train_padded_keywords[0])
    cprint("sample padded valid: ", valid_padded_convs[0])
    cprint("sample padded valid keyword: ", valid_padded_keywords[0])

    # build vocab
    if "convai2" in data_dir:
        test_padded_convs, _ = pad_and_clip_data(
            test, test_keyword, min_context_len, max_context_len+1, max_sent_len, max_keyword_len)
        word2id = build_vocab(train_padded_convs + valid_padded_convs + test_padded_convs,
                              max_vocab_size)  # use entire dataset for vocab as done in (tang 2019)
    else:
        word2id = build_vocab(train_padded_convs, max_vocab_size)
    keyword2id = build_vocab(train_padded_keywords, max_keyword_vocab_size)
    id2keyword = {idx: w for w, idx in keyword2id.items()}
    for w in keyword2id:
        if w not in word2id:
            word2id[w] = len(word2id)  # add OOV keywords to word2id
    id2word = {idx: w for w, idx in word2id.items()}
    keywordid2wordid = [word2id[id2keyword[i]] if id2keyword[i] in word2id else word2id["<unk>"]
                        for i in range(len(keyword2id))]
    vocab_size = len(word2id)
    keyword_vocab_size = len(keyword2id)
    cprint("vocab size: ", vocab_size)
    cprint("keyword vocab size: ", keyword_vocab_size)

    CN_hopk_edge_index, CN_hopk_nodeid2wordid, keywordid2nodeid, node2id = None, None, None, None
    keyword_mask_matrix = None
    if use_CN_hopk_graph > 0:
        cprint("Loading CN_hopk edge index...")
        """
        CN_graph_dict: {
            edge_index: 2D list (num_edges, 2),
            edge_type: list (num_edges, ),
            edge_weight: list (num_edges, ),
            relation2id: {},
            nodeid2wordid: 2D list (num_nodes, 10)
        }
        """
        CN_hopk_graph_path = "./data/{0}/CN_graph_{1}hop_ge1.pkl".format(dataset, use_CN_hopk_graph)
        cprint("Loading graph from ", CN_hopk_graph_path)
        CN_hopk_graph_dict = load_nx_graph_hopk(CN_hopk_graph_path, word2id, keyword2id)
        CN_hopk_edge_index = torch.LongTensor(CN_hopk_graph_dict["edge_index"]).transpose(0, 1).to(device)  # (2, num_edges)
        CN_hopk_nodeid2wordid = torch.LongTensor(CN_hopk_graph_dict["nodeid2wordid"]).to(device)  # (num_nodes, 10)
        node2id = CN_hopk_graph_dict["node2id"]
        id2node = {idx: w for w, idx in node2id.items()}
        keywordid2nodeid = [node2id[id2keyword[i]] if id2keyword[i] in node2id else node2id["<unk>"]
                            for i in range(len(keyword2id))]
        keywordid2nodeid = torch.LongTensor(keywordid2nodeid).to(device)

        keyword_mask_matrix = torch.from_numpy(CN_hopk_graph_dict["edge_mask"]).float()  # numpy array of (keyword_vocab_size, keyword_vocab_size)
        cprint("building keyword mask matrix...")
        if remove_self_loop:
            keyword_mask_matrix[torch.arange(keyword_vocab_size), torch.arange(keyword_vocab_size)] = 0
        cprint("keyword mask matrix non-zeros ratio: ", keyword_mask_matrix.mean())
        cprint("average number of neighbors: ", keyword_mask_matrix.sum(dim=1).mean())
        cprint("sample keyword mask matrix: ", keyword_mask_matrix[:8, :8])
        keyword_mask_matrix = keyword_mask_matrix.to(device)

        cprint("edge index shape: ", CN_hopk_edge_index.shape)
        cprint("edge index[:,:8]", CN_hopk_edge_index[:, :8])
        cprint("nodeid2wordid shape: ", CN_hopk_nodeid2wordid.shape)
        cprint("nodeid2wordid[:5,:8]", CN_hopk_nodeid2wordid[:5, :8])
        cprint("keywordid2nodeid shape: ", keywordid2nodeid.shape)
        cprint("keywordid2nodeid[:8]", keywordid2nodeid[:8])

    # convert edge index
    if utterance_encoder != "":
        keywordid2wordid = torch.LongTensor(keywordid2wordid).to(device)
        cprint("keywordid2wordid shape: ", keywordid2wordid.shape)
        cprint("keywordid2wordid", keywordid2wordid[:8])

    # convert tokens to ids
    train_conv_ids = convert_convs_to_ids(train_padded_convs, word2id)
    valid_conv_ids = convert_convs_to_ids(valid_padded_convs, word2id)
    train_keyword_ids = convert_convs_to_ids(train_padded_keywords, keyword2id)
    valid_keyword_ids = convert_convs_to_ids(valid_padded_keywords, keyword2id)
    cprint(len(train_conv_ids), len(train_keyword_ids), len(valid_conv_ids), len(valid_keyword_ids))
    cprint("sample train token ids: ", train_conv_ids[0])
    cprint("sample train keyword ids: ", train_keyword_ids[0])
    cprint("sample valid token ids: ", valid_conv_ids[0])
    cprint("sample valid keyword ids: ", valid_keyword_ids[0])

    num_examples = len(train_keyword_ids)

    # create model
    if model in ["KW_GNN"]:
        model_kwargs = {
            "embed_size": embed_size,
            "vocab_size": vocab_size,
            "keyword_vocab_size": keyword_vocab_size,
            "hidden_size": hidden_size,
            "output_size": hidden_size,
            "n_layers": n_layers,
            "gnn": gnn,
            "aggregation": aggregation,
            "n_heads": n_heads,
            "dropout": dropout,
            "bidirectional": bidirectional,
            "utterance_encoder": utterance_encoder,
            "keywordid2wordid": keywordid2wordid,
            "keyword_mask_matrix": keyword_mask_matrix,
            "nodeid2wordid": CN_hopk_nodeid2wordid,
            "keywordid2nodeid": keywordid2nodeid,
            "concept_encoder": concept_encoder,
            "combine_node_emb": combine_node_emb
        }

    cprint("Building model...")
    model = globals()[config["model"]](**model_kwargs)
    # cprint(model.edge_weight.shape, model.edge_weight.requires_grad)

    pretrained_word_embedding = None
    if use_pretrained_word_embedding:
        # load pretrained word embedding
        cprint("Loading pretrained word embeddings...")
        pretrained_wordvec_name = pretrained_wordvec_path.split("/")[-1][:-4]
        word_vectors_path = os.path.join(data_dir, "word_vectors_{0}.pkl".format(pretrained_wordvec_name))
        keyword2id = word2id
        if os.path.exists(word_vectors_path):
            cprint("Loading pretrained word embeddings from ", word_vectors_path)
            with open(word_vectors_path, "rb") as f:
                word_vectors = pickle.load(f)
        else:
            cprint("Loading pretrained word embeddings from scratch...")
            word_vectors = load_vectors(pretrained_wordvec_path, keyword2id)
            cprint("Saving pretrained word embeddings to ", word_vectors_path)
            with open(word_vectors_path, "wb") as f:
                pickle.dump(word_vectors, f)

        print("loaded word vector size: ", len(word_vectors))
        pretrained_word_embedding = np.zeros((len(keyword2id), embed_size))
        for w, i in keyword2id.items():
            if w in word_vectors:
                pretrained_word_embedding[i] = np.array(word_vectors[w])
            else:
                pretrained_word_embedding[i] = np.random.randn(embed_size)/9
        pretrained_word_embedding[0] = 0  # 0 for PAD embedding
        pretrained_word_embedding = torch.from_numpy(pretrained_word_embedding).float()
        cprint("word embedding size: ", pretrained_word_embedding.shape)
        model.init_embedding(pretrained_word_embedding, fix_word_embedding)

    cprint(model)
    cprint("number of parameters: ", count_parameters(model))
    model.to(device)

    # optimization
    amp = None
    if fp16:
        from apex import amp
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: lr_decay ** epoch)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda step: 1/(1+lr_decay*step/(num_examples/batch_size)))
    if fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    # training
    epoch_train_losses = []
    epoch_valid_losses = []
    epoch_valid_precisions = []
    epoch_valid_recalls = []
    best_model_statedict = {}
    cprint("Start training...")
    for epoch in range(epochs):
        cprint("-"*80)
        cprint("Epoch", epoch+1)
        train_batches = create_batches_keyword_prediction(train_conv_ids, train_keyword_ids, 2*max_keyword_len, batch_size, \
            shuffle=True, remove_self_loop=remove_self_loop, keywordid2wordid=keywordid2wordid, \
            keyword_mask_matrix=keyword_mask_matrix.cpu().numpy(), use_last_k_utterances=use_last_k_utterances, use_utterance_concepts=use_utterance_concepts, \
            keyword2id=keyword2id, node2id=node2id, id2word=id2word)
        valid_batches = create_batches_keyword_prediction(valid_conv_ids, valid_keyword_ids, 2*max_keyword_len, batch_size, \
            shuffle=False, remove_self_loop=remove_self_loop, keywordid2wordid=keywordid2wordid, \
            keyword_mask_matrix=keyword_mask_matrix.cpu().numpy(), use_last_k_utterances=use_last_k_utterances, use_utterance_concepts=use_utterance_concepts, \
            keyword2id=keyword2id, node2id=node2id, id2word=id2word)

        cprint("train batches 1st example: ")
        for k, v in train_batches[0].items():
            if k == "batch_X_keywords":
                cprint(k, v[0], [id2keyword[w] for w in v[0]])
            if k == "batch_X_utterances":
                utters = []
                for utter in v[0]:
                    utters.append([id2word[w] for w in utter])
                cprint(k, v[0], utters)
            if k == "batch_X_concepts" and len(v) > 0:
                cprint(k, v[0], [id2node[w] for w in v[0]])
            if k == "batch_y":
                cprint(k, v[0], [id2keyword[w] for w in v[0]])

        model.train()
        train_loss, (train_precision, train_recall) = run_epoch(train_batches, model, optimizer, epoch=epoch, training=True, device=device, \
            fp16=fp16, amp=amp, step_scheduler=scheduler, keyword_mask_matrix=keyword_mask_matrix, keywordid2wordid=keywordid2wordid, \
            CN_hopk_edge_index=CN_hopk_edge_index, use_utterance_concepts=use_utterance_concepts)
        cprint("Config id: {}, Epoch {}: train precision: {}, train recall: {}"
               .format(config_id, epoch+1, train_precision, train_recall))

        model.eval()
        valid_loss, (valid_precision, valid_recall) = run_epoch(valid_batches, model, optimizer, epoch=epoch, training=False, device=device, \
            keyword_mask_matrix=keyword_mask_matrix, keywordid2wordid=keywordid2wordid, \
            CN_hopk_edge_index=CN_hopk_edge_index, use_utterance_concepts=use_utterance_concepts)

        # scheduler.step()
        cprint("Config id: {}, Epoch {}: train loss: {}, valid loss: {}, valid precision: {}, valid recall: {}"
               .format(config_id, epoch+1, train_loss, valid_loss, valid_precision, valid_recall))
        if scheduler is not None:
            cprint("Current learning rate: ", scheduler.get_last_lr())

        epoch_train_losses.append(train_loss)
        epoch_valid_losses.append(valid_loss)
        epoch_valid_precisions.append(valid_precision)
        epoch_valid_recalls.append(valid_recall)

        if save_model_path != "":
            if epoch == 0:
                for k, v in model.state_dict().items():
                    best_model_statedict[k] = v.cpu()
            else:
                if epoch_valid_recalls[-1][0] == max([recall1 for recall1, _, _ in epoch_valid_recalls]):
                    for k, v in model.state_dict().items():
                        best_model_statedict[k] = v.cpu()

        # early stopping
        if len(epoch_valid_recalls) >= 3 and epoch_valid_recalls[-1][0] < epoch_valid_recalls[-2][0] and \
                epoch_valid_recalls[-2][0] < epoch_valid_recalls[-3][0]:
            break

    config.pop("seed")
    config.pop("config_id")
    metrics["config"] = config
    metrics["score"] = max([recall[0] for recall in epoch_valid_recalls])
    metrics["epoch"] = np.argmax([recall[0] for recall in epoch_valid_recalls]).item()
    metrics["recall"] = epoch_valid_recalls[metrics["epoch"]]
    metrics["precision"] = epoch_valid_precisions[metrics["epoch"]]

    if save_model_path:
        cprint("Saving model to ", save_model_path)
        best_model_statedict["word2id"] = keyword2id
        best_model_statedict["model_kwargs"] = model_kwargs
        torch.save(best_model_statedict, save_model_path)

    return metrics
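# --- Side note (editor's addition): the LambdaLR above implements a step-level
# inverse-time decay, lr_t = lr / (1 + lr_decay * t / steps_per_epoch), where
# steps_per_epoch = num_examples / batch_size. A self-contained sketch with
# made-up numbers, for reference only:
def _example_inverse_time_lr():
    import torch
    from torch.optim.lr_scheduler import LambdaLR
    params = [torch.nn.Parameter(torch.zeros(1))]
    opt = torch.optim.Adam(params, lr=1e-3)
    steps_per_epoch, lr_decay = 100, 0.5
    sched = LambdaLR(opt, lr_lambda=lambda step: 1 / (1 + lr_decay * step / steps_per_epoch))
    for _ in range(200):  # two "epochs" of optimizer steps
        opt.step()
        sched.step()
    # after 200 steps the lr is 1e-3 / (1 + 0.5 * 200 / 100) = 5e-4
    return sched.get_last_lr()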
def main(config, progress):
    # save config
    with open("./log/configs.json", "a") as f:
        json.dump(config, f)
        f.write("\n")
    cprint("*" * 80)
    cprint("Experiment progress: {0:.2f}%".format(progress * 100))
    cprint("*" * 80)
    metrics = {}

    # data hyper-params
    data_path = config["data_path"]
    keyword_path = config["keyword_path"]
    pretrained_wordvec_path = config["pretrained_wordvec_path"]
    data_dir = "/".join(data_path.split("/")[:-1])
    dataset = data_path.split("/")[-2]  # convai2 or casual
    test_mode = bool(config["test_mode"])
    save_model_path = config["save_model_path"]
    load_kw_prediction_path = config["load_kw_prediction_path"]
    min_context_len = config["min_context_len"]
    max_context_len = config["max_context_len"]
    max_sent_len = config["max_sent_len"]
    max_keyword_len = config["max_keyword_len"]
    max_vocab_size = config["max_vocab_size"]
    max_keyword_vocab_size = config["max_keyword_vocab_size"]
    flatten_context = config["flatten_context"]

    # model hyper-params
    config_id = config["config_id"]
    model = config["model"]
    use_CN_hopk_graph = config["use_CN_hopk_graph"]
    use_utterance_concepts = use_CN_hopk_graph > 0
    concept_encoder = config["concept_encoder"]
    combine_word_concepts = config["combine_word_concepts"]
    gnn = config["gnn"]
    encoder = config["encoder"]
    aggregation = config["aggregation"]
    use_keywords = bool(config["use_keywords"])
    keyword_score_weight = config["keyword_score_weight"]
    keyword_encoder = config["keyword_encoder"]  # mean, max, GRU, any_max
    embed_size = config["embed_size"]
    use_pretrained_word_embedding = bool(config["use_pretrained_word_embedding"])
    fix_word_embedding = bool(config["fix_word_embedding"])
    gnn_hidden_size = config["gnn_hidden_size"]
    gnn_layers = config["gnn_layers"]
    encoder_hidden_size = config["encoder_hidden_size"]
    encoder_layers = config["encoder_layers"]
    n_heads = config["n_heads"]
    dropout = config["dropout"]

    # training hyper-params
    batch_size = config["batch_size"]
    epochs = config["epochs"]
    lr = config["lr"]
    lr_decay = config["lr_decay"]
    seed = config["seed"]
    device = torch.device(config["device"])
    fp16 = bool(config["fp16"])
    fp16_opt_level = config["fp16_opt_level"]

    # set seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if "convai2" in data_dir and min_context_len != 2:
        raise ValueError("convai2 dataset has min context len of 2")
    if use_pretrained_word_embedding and str(embed_size) not in pretrained_wordvec_path:
        raise ValueError("embedding size and pretrained_wordvec_path not match")
    if use_keywords and load_kw_prediction_path == "":
        raise ValueError("kw model path needs to be provided when use_keywords is True")

    # load data
    cprint("Loading conversation data...")
    train, valid, test = load_pickle(data_path)
    train_keyword, valid_keyword, test_keyword = load_pickle(keyword_path)
    train_candidate, valid_candidate = None, None

    # load 20 candidates
    train_candidate, valid_candidate, test_candidate = load_pickle(
        os.path.join(data_dir, "candidate.pkl"))

    if test_mode:
        cprint("Testing model...")
        train = train + valid
        train_keyword = train_keyword + valid_keyword
        valid = test
        valid_keyword = test_keyword
        train_candidate = train_candidate + valid_candidate
        valid_candidate = test_candidate

    cprint("sample train: ", train[0])
    cprint("sample train keyword: ", train_keyword[0])
    cprint("sample valid: ", valid[0])
    cprint("sample valid keyword: ", valid_keyword[0])

    # clip and pad data
    train_padded_convs, train_padded_keywords = pad_and_clip_data(
        train, train_keyword, min_context_len, max_context_len + 1, max_sent_len, max_keyword_len)
    valid_padded_convs, valid_padded_keywords = pad_and_clip_data(
        valid, valid_keyword, min_context_len, max_context_len + 1, max_sent_len, max_keyword_len)
    train_padded_candidates = pad_and_clip_candidate(train_candidate, max_sent_len)
    valid_padded_candidates = pad_and_clip_candidate(valid_candidate, max_sent_len)

    # build vocab
    if "convai2" in data_dir:
        test_padded_convs, _ = pad_and_clip_data(
            test, test_keyword, min_context_len, max_context_len + 1, max_sent_len, max_keyword_len)
        word2id = build_vocab(train_padded_convs + valid_padded_convs + test_padded_convs,
                              max_vocab_size)  # use entire dataset for vocab
    else:
        word2id = build_vocab(train_padded_convs, max_vocab_size)
    keyword2id = build_vocab(train_padded_keywords, max_keyword_vocab_size)
    id2keyword = {idx: w for w, idx in keyword2id.items()}
    for w in keyword2id:
        if w not in word2id:
            word2id[w] = len(word2id)  # add OOV keywords to word2id
    id2word = {idx: w for w, idx in word2id.items()}
    cprint("keywords that are not in word2id: ", set(keyword2id.keys()) - set(word2id.keys()))
    vocab_size = len(word2id)
    keyword_vocab_size = len(keyword2id)
    cprint("vocab size: ", vocab_size)
    cprint("keyword vocab size: ", keyword_vocab_size)

    # create a mapping from keyword id to word id
    keywordid2wordid = None
    train_candidate_keyword_ids, valid_candidate_keyword_ids = None, None
    if use_keywords:
        keywordid2wordid = [word2id[id2keyword[i]] if id2keyword[i] in word2id else word2id["<unk>"]
                            for i in range(len(keyword2id))]
        keywordid2wordid = torch.LongTensor(keywordid2wordid).to(device)

        # load candidate keywords
        candidate_keyword_path = os.path.join(data_dir, "candidate_keyword.pkl")
        if os.path.exists(candidate_keyword_path):
            cprint("Loading candidate keywords from ", candidate_keyword_path)
            train_candidate_keywords, valid_candidate_keywords, test_candidate_keywords = load_pickle(
                candidate_keyword_path)
        else:
            cprint("Creating candidate keywords...")
            train_candidate_keywords = extract_keywords_from_candidates(train_candidate, keyword2id)
            valid_candidate_keywords = extract_keywords_from_candidates(valid_candidate, keyword2id)
            test_candidate_keywords = extract_keywords_from_candidates(test_candidate, keyword2id)
            save_pickle((train_candidate_keywords, valid_candidate_keywords, test_candidate_keywords),
                        candidate_keyword_path)

        if test_mode:
            train_candidate_keywords = train_candidate_keywords + valid_candidate_keywords
            valid_candidate_keywords = test_candidate_keywords

        # pad
        cprint("Padding candidate keywords...")
        train_padded_candidate_keywords = pad_and_clip_candidate(train_candidate_keywords, max_keyword_len)
        valid_padded_candidate_keywords = pad_and_clip_candidate(valid_candidate_keywords, max_keyword_len)

        # convert candidates to ids
        cprint("Converting candidate keywords to ids...")
        train_candidate_keyword_ids = convert_candidates_to_ids(train_padded_candidate_keywords, keyword2id)
        valid_candidate_keyword_ids = convert_candidates_to_ids(valid_padded_candidate_keywords, keyword2id)

    # load CN graph
    CN_hopk_edge_index, CN_hopk_nodeid2wordid, keywordid2nodeid, node2id, CN_hopk_edge_matrix_mask = \
        None, None, None, None, None
    if use_CN_hopk_graph > 0:
        cprint("Loading CN_hopk edge index...")
        """
        CN_graph_dict: {
            edge_index: 2D list (num_edges, 2),
            edge_weight: list (num_edges, ),
            nodeid2wordid: 2D list (num_nodes, 10),
            edge_mask: numpy array of (keyword_vocab_size, keyword_vocab_size)
        }
        """
        CN_hopk_graph_path = "./data/{0}/CN_graph_{1}hop_ge1.pkl".format(dataset, use_CN_hopk_graph)
        cprint("Loading graph from ", CN_hopk_graph_path)
        CN_hopk_graph_dict = load_nx_graph_hopk(CN_hopk_graph_path, word2id, keyword2id)
        CN_hopk_edge_index = torch.LongTensor(CN_hopk_graph_dict["edge_index"]).transpose(0, 1).to(device)  # (2, num_edges)
        CN_hopk_nodeid2wordid = torch.LongTensor(CN_hopk_graph_dict["nodeid2wordid"]).to(device)  # (num_nodes, 10)
        node2id = CN_hopk_graph_dict["node2id"]
        id2node = {idx: w for w, idx in node2id.items()}
        keywordid2nodeid = [node2id[id2keyword[i]] if id2keyword[i] in node2id else node2id["<unk>"]
                            for i in range(len(keyword2id))]
        keywordid2nodeid = torch.LongTensor(keywordid2nodeid).to(device)

        cprint("edge index shape: ", CN_hopk_edge_index.shape)
        cprint("edge index[:,:8]", CN_hopk_edge_index[:, :8])
        cprint("nodeid2wordid shape: ", CN_hopk_nodeid2wordid.shape)
        cprint("nodeid2wordid[:5,:8]", CN_hopk_nodeid2wordid[:5, :8])
        cprint("keywordid2nodeid shape: ", keywordid2nodeid.shape)
        cprint("keywordid2nodeid[:8]", keywordid2nodeid[:8])

    # convert tokens to ids
    train_conv_ids = convert_convs_to_ids(train_padded_convs, word2id)
    valid_conv_ids = convert_convs_to_ids(valid_padded_convs, word2id)
    train_keyword_ids = convert_convs_to_ids(train_padded_keywords, keyword2id)
    valid_keyword_ids = convert_convs_to_ids(valid_padded_keywords, keyword2id)
    train_candidate_ids, valid_candidate_ids = None, None
    train_candidate_ids = convert_candidates_to_ids(train_padded_candidates, word2id)
    valid_candidate_ids = convert_candidates_to_ids(valid_padded_candidates, word2id)

    keyword_mask_matrix = None
    if use_CN_hopk_graph > 0:
        keyword_mask_matrix = torch.from_numpy(CN_hopk_graph_dict["edge_mask"]).float()  # numpy array of (keyword_vocab_size, keyword_vocab_size)
        cprint("building keyword mask matrix...")
        keyword_mask_matrix[torch.arange(keyword_vocab_size), torch.arange(keyword_vocab_size)] = 0  # remove self loop
        cprint("keyword mask matrix non-zeros ratio: ", keyword_mask_matrix.mean())
        cprint("average number of neighbors: ", keyword_mask_matrix.sum(dim=1).mean())
        cprint("sample keyword mask matrix: ", keyword_mask_matrix[:8, :8])
        keyword_mask_matrix = keyword_mask_matrix.to(device)

    num_examples = len(train_conv_ids)
    cprint("sample train token ids: ", train_conv_ids[0])
    cprint("sample train keyword ids: ", train_keyword_ids[0])
    cprint("sample valid token ids: ", valid_conv_ids[0])
    cprint("sample valid keyword ids: ", valid_keyword_ids[0])
    cprint("sample train candidate ids: ", train_candidate_ids[0])
    cprint("sample valid candidate ids: ", valid_candidate_ids[0])
    if use_keywords:
        cprint("sample train candidate keyword ids: ", train_candidate_keyword_ids[0])
        cprint("sample valid candidate keyword ids: ", valid_candidate_keyword_ids[0])

    # create model
    if model in ["CoGraphMatcher"]:
        model_kwargs = {
            "embed_size": embed_size,
            "vocab_size": vocab_size,
            "gnn_hidden_size": gnn_hidden_size,
            "gnn_layers": gnn_layers,
            "encoder_hidden_size": encoder_hidden_size,
            "encoder_layers": encoder_layers,
            "n_heads": n_heads,
            "CN_hopk_edge_matrix_mask": CN_hopk_edge_matrix_mask,
            "nodeid2wordid": CN_hopk_nodeid2wordid,
            "keywordid2wordid": keywordid2wordid,
            "keywordid2nodeid": keywordid2nodeid,
            "concept_encoder": concept_encoder,
            "gnn": gnn,
            "encoder": encoder,
            "aggregation": aggregation,
            "use_keywords": use_keywords,
            "keyword_score_weight": keyword_score_weight,
            "keyword_encoder": keyword_encoder,
            "dropout": dropout,
            "combine_word_concepts": combine_word_concepts
        }

    # create keyword model
    kw_model = ""
    use_last_k_utterances = -1
    if use_keywords:
        kw_model = load_kw_prediction_path.split("/")[-1][:-3]  # keyword prediction model name
        if "GNN" in kw_model:
            kw_model = "KW_GNN"
            use_last_k_utterances = 2

        # load pretrained model
        cprint("Loading weights from ", load_kw_prediction_path)
        kw_model_checkpoint = torch.load(load_kw_prediction_path, map_location=device)
        if "word2id" in kw_model_checkpoint:
            keyword2id = kw_model_checkpoint.pop("word2id")
        if "model_kwargs" in kw_model_checkpoint:
            kw_model_kwargs = kw_model_checkpoint.pop("model_kwargs")
            kw_model = globals()[kw_model](**kw_model_kwargs)
        kw_model.load_state_dict(kw_model_checkpoint)
        kw_model.to(device)
        kw_model.eval()  # set to evaluation mode, no training required

    cprint("Building model...")
    model = globals()[config["model"]](**model_kwargs)

    cprint("Initializing pretrained word embeddings...")
    pretrained_word_embedding = None
    if use_pretrained_word_embedding:
        # load pretrained word embedding
        cprint("Loading pretrained word embeddings...")
        pretrained_wordvec_name = pretrained_wordvec_path.split("/")[-1][:-4]
        word_vectors_path = os.path.join(data_dir, "word_vectors_{0}.pkl".format(pretrained_wordvec_name))
        if os.path.exists(word_vectors_path):
            cprint("Loading pretrained word embeddings from ", word_vectors_path)
            with open(word_vectors_path, "rb") as f:
                word_vectors = pickle.load(f)
        else:
            cprint("Loading pretrained word embeddings from scratch...")
            word_vectors = load_vectors(pretrained_wordvec_path, word2id)
            cprint("Saving pretrained word embeddings to ", word_vectors_path)
            with open(word_vectors_path, "wb") as f:
                pickle.dump(word_vectors, f)

        cprint("pretrained word embedding size: ", len(word_vectors))
        pretrained_word_embedding = np.zeros((len(word2id), embed_size))
        for w, i in word2id.items():
            if w in word_vectors:
                pretrained_word_embedding[i] = np.array(word_vectors[w])
            else:
                pretrained_word_embedding[i] = np.random.randn(embed_size) / 9
        pretrained_word_embedding[0] = 0  # 0 for PAD embedding
        pretrained_word_embedding = torch.from_numpy(pretrained_word_embedding).float()
        cprint("word embedding size: ", pretrained_word_embedding.shape)
        model.init_embedding(pretrained_word_embedding, fix_word_embedding)

    cprint(model)
    cprint("number of parameters: ", count_parameters(model))
    model.to(device)

    # optimization
    amp = None
    if fp16:
        from apex import amp
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = LambdaLR(optimizer, lr_lambda=lambda step: 1 / (1 + lr_decay * step / (num_examples / batch_size)))
    if fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    # training
    epoch_train_losses = []
    epoch_valid_losses = []
    epoch_valid_precisions = []
    epoch_valid_recalls = []
    epoch_valid_MRRs = []
    best_model_statedict = {}
    cprint("Start training...")
    for epoch in range(epochs):
        cprint("-" * 80)
        cprint("Epoch", epoch + 1)
        train_batches = create_batches_retrieval(train_conv_ids, train_keyword_ids, train_candidate_ids, train_candidate_keyword_ids, \
            2*max_keyword_len, batch_size, shuffle=True, use_keywords=use_keywords, use_candidate_keywords=use_keywords, use_utterance_concepts=use_utterance_concepts, \
            node2id=node2id, id2word=id2word, flatten_context=flatten_context, use_last_k_utterances=use_last_k_utterances)
        valid_batches = create_batches_retrieval(valid_conv_ids, valid_keyword_ids, valid_candidate_ids, valid_candidate_keyword_ids, \
            2*max_keyword_len, batch_size, shuffle=False, use_keywords=use_keywords, use_candidate_keywords=use_keywords, use_utterance_concepts=use_utterance_concepts, \
            node2id=node2id, id2word=id2word, flatten_context=flatten_context, use_last_k_utterances=use_last_k_utterances)

        if epoch == 0:
            cprint("number of optimization steps per epoch: ", len(train_batches))  # 3361
            cprint("train batches 1st example: ")
            for k, v in train_batches[0].items():
                if k == "batch_context":
                    utters = []
                    for utter in v[0]:
                        utters.append([id2word[w] for w in utter])
                    cprint("\n", k, v[0], utters)
                if k == "batch_candidates":
                    utters = []
                    for utter in v[0]:
                        utters.append([id2word[w] for w in utter])
                    cprint("\n", k, v[0], utters)
                if k == "batch_context_kw":
                    cprint("\n", k, v[0], [id2keyword[w] for w in v[0]])
                if k == "batch_candidates_kw":
                    utters = []
                    for utter in v[0]:
                        utters.append([id2keyword[w] for w in utter])
                    cprint("\n", k, v[0], utters)
                if k == "batch_context_concepts":
                    if len(v[0][0]) > 0:
                        utters = []
                        for utter in v[0]:
                            utters.append([id2node[w] for w in utter])
                        cprint("\n", k, v[0], utters)
                if k == "batch_candidates_concepts":
                    utters = []
                    for utter in v[0]:
                        utters.append([id2node[w] for w in utter])
                    cprint("\n", k, v[0], utters)
                if k == "batch_context_for_keyword_prediction":
                    utters = []
                    for utter in v[0]:
                        utters.append([id2word[w] for w in utter])
                    cprint("\n", k, v[0], utters)
                if k == "batch_context_concepts_for_keyword_prediction":
                    cprint("\n", k, v[0], [id2node[w] for w in v[0]])

        model.train()
        train_loss, (_, _, _) = run_epoch(train_batches, model, optimizer, training=True, device=device, fp16=fp16, amp=amp, \
            kw_model=kw_model, keyword_mask_matrix=keyword_mask_matrix, step_scheduler=scheduler, keywordid2wordid=keywordid2wordid, \
            CN_hopk_edge_index=CN_hopk_edge_index)

        model.eval()
        valid_loss, (valid_precision, valid_recall, valid_MRR) = run_epoch(valid_batches, model, optimizer, training=False, device=device, \
            kw_model=kw_model, keyword_mask_matrix=keyword_mask_matrix, keywordid2wordid=keywordid2wordid, CN_hopk_edge_index=CN_hopk_edge_index)

        # scheduler.step()
        cprint("Config id: {0}, Epoch {1}: train loss: {2:.4f}, valid loss: {3:.4f}, valid precision: {4}, valid recall: {5}, valid MRR: {6}"
               .format(config_id, epoch + 1, train_loss, valid_loss, valid_precision, valid_recall, valid_MRR))
        if scheduler is not None:
            cprint("Current learning rate: ", scheduler.get_last_lr())

        epoch_train_losses.append(train_loss)
        epoch_valid_losses.append(valid_loss)
        epoch_valid_precisions.append(valid_precision)
        epoch_valid_recalls.append(valid_recall)
        epoch_valid_MRRs.append(valid_MRR)

        if save_model_path != "":
            if epoch == 0:
                for k, v in model.state_dict().items():
                    best_model_statedict[k] = v.cpu()
            else:
                if epoch_valid_recalls[-1][0] == max([recall1 for recall1, _, _ in epoch_valid_recalls]):
                    for k, v in model.state_dict().items():
                        best_model_statedict[k] = v.cpu()

        # early stopping
        if len(epoch_valid_recalls) >= 3 and epoch_valid_recalls[-1][0] < epoch_valid_recalls[-2][0] and \
                epoch_valid_recalls[-2][0] < epoch_valid_recalls[-3][0]:
            break

    config.pop("seed")
    config.pop("config_id")
    metrics["config"] = config
    metrics["score"] = max([recall[0] for recall in epoch_valid_recalls])
    metrics["epoch"] = np.argmax([recall[0] for recall in epoch_valid_recalls]).item()
    metrics["recall"] = epoch_valid_recalls[metrics["epoch"]]
    metrics["MRR"] = epoch_valid_MRRs[metrics["epoch"]]
    metrics["precision"] = epoch_valid_precisions[metrics["epoch"]]

    if save_model_path and seed == 1:
        cprint("Saving model to ", save_model_path)
        best_model_statedict["word2id"] = word2id
        best_model_statedict["model_kwargs"] = model_kwargs
        torch.save(best_model_statedict, save_model_path)

    return metrics
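# --- Sketch (editor's addition): the checkpoint/early-stopping policy shared by
# both training loops above, pulled out as a standalone helper for clarity. It
# keeps the state dict of the epoch with the best recall@1 and stops after
# recall@1 has decreased for two consecutive epochs. Illustrative only.
def _example_should_stop_and_keep_best(model, epoch_valid_recalls, best_state):
    recall_at_1 = [r[0] for r in epoch_valid_recalls]
    if len(recall_at_1) == 1 or recall_at_1[-1] == max(recall_at_1):
        # current epoch is the best so far: snapshot weights on CPU
        for k, v in model.state_dict().items():
            best_state[k] = v.cpu()
    stop = (len(recall_at_1) >= 3
            and recall_at_1[-1] < recall_at_1[-2]
            and recall_at_1[-2] < recall_at_1[-3])
    return stop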
prob, spn = spn_for_evidence(
    spn, evidence,
    node_likelihood=inference_support_ranges,
    distribution_update_ranges=distribution_update_ranges)
print(prob)


if __name__ == '__main__':
    import os
    from util import io

    np.random.seed(123)
    path = os.path.dirname(os.path.abspath(__file__)) + "/"
    if os.path.exists(path + "spn.pkl"):
        spn = io.load_pickle(path + "spn.pkl")
    else:
        data, parametric_types = create_p_value_dataset()
        spn = learn_parametric_spn(data, parametric_types)
        io.dump_pickle(path, "spn.pkl", spn)
    print(spn)

    visualize_Density_2d(spn)
    #visualize_Density(spn)
    #g_nodes = get_nodes_with_weight(spn, 2)
    #visualize_Gaussian(g_nodes)
    #spn_util.plot_spn(spn, "new.pdf")
args = parser.parse_args()
dataset = args.dataset
chat_model_name = args.chat_model
kw_model_name = args.kw_model
num_sessions = args.num_sessions
max_turns = args.max_turns
device = args.device if args.device >= 0 else "cpu"
device = torch.device(device)
apply_commonsense = "Commonsense" in chat_model_name

# load starting corpus: a list of greeting sentences from the training conversations
# load target set: a list of target keywords
print("Loading data...")
start_corpus = load_pickle("./data/{0}/start_corpus.pkl".format(dataset))
target_set = load_pickle("./data/{0}/target_set.pkl".format(dataset))
candidate_pool = load_pickle("./data/{0}/candidate_pool.pkl".format(dataset))  # all candidates from training set
# candidate_pool = random.sample(candidate_pool, 2000)

# load chat model
chat_model_path = "./saved_model/{0}/{1}.pt".format(dataset, chat_model_name)
print("Loading chat model from {0}...".format(chat_model_path))
chat_model_checkpoint = torch.load(chat_model_path, map_location=device)
word2id = chat_model_checkpoint.pop("word2id")
id2word = {idx: w for w, idx in word2id.items()}
print("word vocab size: ", len(word2id))
chat_model_kwargs = chat_model_checkpoint.pop("model_kwargs")
if "Commonsense" in chat_model_name:
    # chat_model_name.replace("_Commonsense", "")
    # chat_model_name = chat_model_name if "_" not in chat_model_name else chat_model_name.split("_")[0]