def get_term_weight(x_text_arr, max_doc_len, term_weight_file):
    dc_dict = ju.load(term_weight_file)
    term_weights = []
    for x in x_text_arr:
        x_word_list = x.strip().split()
        sen_length = len(x_word_list)

        # compute document-level tf
        tf_dict = collections.defaultdict(int)
        for word in x_word_list:
            tf_dict[word] += 1

        term_weight = [0] * max_doc_len
        for i in range(min(max_doc_len, len(x_word_list))):
            term_weight[i] = tf_dict[x_word_list[i]] / sen_length * dc_dict[x_word_list[i]]

        # min-max normalization
        term_weight = np.array(term_weight)
        max_value = term_weight.max()
        min_value = term_weight.min()
        mid_value = max_value - min_value
        if mid_value == 0:
            # shift by one so that no position ends up with weight zero
            term_weight = [1 for value in term_weight]
        else:
            term_weight = [((value - min_value) / mid_value) * 2 + 1 for value in term_weight]

        term_weights.append(term_weight)
    return term_weights
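# A minimal usage sketch for get_term_weight (an assumption, not shipped code): it
# expects the weight file to be a {word: dc_value} json such as the ones saved by
# calc_bdc/calc_dc below, the input texts to be whitespace-tokenized already, and
# every token to be present in that dict (unknown tokens would raise KeyError).
# The file path is illustrative only.
def _example_get_term_weight():
    texts = ["红楼梦 的 作者 是 曹雪芹", "水浒传 创作 于 元末明初"]
    weight_file = from_project_root(
        "processed_data/saved_weight/phrase_level_1gram_dc.json")  # assumed path
    weights = get_term_weight(texts, max_doc_len=300, term_weight_file=weight_file)
    # each weights[i] has length max_doc_len; real tokens are rescaled into [1, 3]
    # and padding positions end up as 1 because of the min-max rescaling above
    print(weights[0][:10])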
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: update dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1

    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
def main():
    json_url = from_project_root("processed_data/entity2contents.json")
    json_data = json_util.load(json_url)
    print(json_data["红楼梦"])
    exit()
def triplet2content_fn(triplets_url, entity2contents_url, save_url):
    """ map each triplet to the content of its head entity

    :param triplets_url: url to the triplet file, one space-separated triple per line
    :param entity2contents_url: url to the entity-to-content json file
    :param save_url: url to save the resulting triplet-to-content json
    :return:
    """
    entity2content = json_util.load(entity2contents_url)
    triplet2content = {}
    with open(triplets_url, 'r', encoding="utf-8") as triplet_file:
        for line in triplet_file:
            # each line holds "entity1 entity2 relation"
            triplet = line.strip().split(' ')
            entity1 = triplet[0]
            relation = triplet[2]
            entity2 = triplet[1]
            # keep the triplet only if both entities are mentioned in the head entity's content
            if entity1 in entity2content[entity1] and entity2 in entity2content[entity1]:
                key = entity1 + "#" + relation + "#" + entity2
                if key not in triplet2content:
                    triplet2content[key] = entity2content[entity1]
    json_util.dump(triplet2content, save_url)
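# Hedged usage sketch for triplet2content_fn: the triplet file name is an assumption,
# while entity2contents.json and triplet2contents.csv are the names used elsewhere
# in this code.
def _example_triplet2content():
    triplet2content_fn(
        triplets_url=from_project_root("processed_data/triplets.txt"),  # assumed path
        entity2contents_url=from_project_root("processed_data/entity2contents.json"),
        save_url=from_project_root("processed_data/triplet2contents.csv"))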
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: update dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # to store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):
        # for calc dc
        arr = np.array(list(word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # for calc bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
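# A hedged summary of what the loop above computes: for a word t with per-class
# counts f(t, c_i) and class sizes f(c_i),
#   dc(t)  = 1 + sum_i p_i * log(p_i) / log(|C|),  where p_i = f(t, c_i) / f(t)
#   bdc(t) = 1 + sum_i q_i * log(q_i) / log(|C|),  where q_i is f(t, c_i)/f(c_i) renormalized over classes
# so a word spread evenly over all classes scores near 0 and a word confined to one
# class scores near 1. A tiny self-contained check of that property:
def _example_bdc_toy():
    import numpy as np
    counts = np.array([5.0, 5.0])       # word appears equally in 2 classes
    p = counts / counts.sum()
    print(1 + (p * np.log(p)).sum() / np.log(len(counts)))  # -> ~0.0 (least discriminative)
    counts = np.array([10.0, 1e-12])    # word (almost) confined to one class; tiny value avoids log(0)
    p = counts / counts.sum()
    print(1 + (p * np.log(p)).sum() / np.log(len(counts)))  # -> ~1.0 (most discriminative)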
def calc_dc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the dc value of all tokens

    Args:
        data_url: url to data file
        update: update dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: dc dict {word: dc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(level, ngram))
    if not update and exists(dc_url):
        return ju.load(dc_url)

    # calc_bdc saves the dc dict as a side effect
    calc_bdc(data_url, update=True, ngram=ngram)
    return ju.load(dc_url)
def main(entity):
    # load data
    triplet2content_url = from_project_root("processed_data/triplet2contents.csv")
    triplet2content = json_util.load(triplet2content_url)

    # questions can also be generated for a single triplet, e.g.
    # generatorBytriplet(triplet2content, "水浒传#创作年代#元末明初")

    # merge human-written questions with model-generated ones
    questions = []
    model_questions = generatorByEntity(triplet2content, entity)

    question_data_file = from_project_root("processed_data/humanQuestion.json")
    human_questions = json_util.load(question_data_file)
    if entity in human_questions.keys():
        questions.extend(human_questions[entity])
    questions.extend(model_questions)
    print(questions)
    return questions
def load_raw(data_url):
    save_url = data_url.replace('.dat', '.raw.pkl')
    if os.path.exists(save_url):
        return joblib.load(save_url)

    char_vocab = ju.load(join("./data/running", "char_vocab.json"))

    labels, txts = [], []
    with open(data_url, "r") as f:
        for line in f:
            line_split = line.strip("\n").split("\t")
            labels.append(line_split[1])
            # map each character to its id in the char vocabulary
            char_index = [char_vocab[char] for char in line_split[0]]
            txts.append(char_index)
    return txts, labels
def main():
    # load data
    triplet2content_url = from_project_root("processed_data/triplet2contents.csv")
    triplet2content = json_util.load(triplet2content_url)

    # questions can also be generated for a single triplet, e.g.
    # generatorBytriplet(triplet2content, "红楼梦#作者#曹雪芹")
    entity = "红楼梦"
    questions = generatorByEntity(triplet2content, entity)
    print(questions)
    return questions
def transfer2json(triplet2content_file):
    triplet2content = json_util.load(triplet2content_file)

    # build nodes.json: collect every distinct head and tail entity
    names = []
    nodes = []
    links = []
    for triplet in triplet2content.keys():
        units = triplet.split("#")  # entity1 # relation # entity2
        if units[0] not in names:
            names.append(units[0])
        if units[2] not in names:
            names.append(units[2])
    for name in names:
        node = {'category': 0, 'name': name}
        nodes.append(node)

    # build links, keeping each triplet unique
    triplets = []
    for triplet in triplet2content.keys():
        if triplet not in triplets:
            triplets.append(triplet)
    for triplet in triplets:
        units = triplet.split("#")
        link = {"source": units[0], "target": units[2], "name": units[1]}
        links.append(link)

    # save files
    json_util.dump(nodes, from_project_root("data/analogyKG_nodes.json"))
    json_util.dump(links, from_project_root("data/analogyKG_links.json"))
def train(n_epochs=30,
          embedding_url=None,
          char_feat_dim=50,
          freeze=False,
          train_url=TRAIN_URL,
          dev_url=DEV_URL,
          test_url=None,
          max_region=10,
          learning_rate=0.001,
          batch_size=100,
          early_stop=5,
          clip_norm=5,
          device='auto',
          save_only_best=True
          ):
    """ Train deep exhaustive model, Sohrab et al. 2018 EMNLP

    Args:
        n_epochs: number of epochs
        embedding_url: url to pretrained embedding file, set as None to use random embedding
        char_feat_dim: size of character level feature
        freeze: whether to freeze embedding
        train_url: url to train data
        dev_url: url to dev data
        test_url: url to test data for evaluating, set to None for not evaluating
        max_region: max entity region size
        learning_rate: learning rate
        batch_size: batch_size
        early_stop: early stop for training
        clip_norm: whether to perform norm clipping, set to 0 if not needed
        device: device for torch
        save_only_best: only save model of best performance
    """
    # print arguments
    arguments = json.dumps(vars(), indent=2)
    print("exhaustive model is training with arguments", arguments)
    device = get_device(device)

    train_set = ExhaustiveDataset(train_url, device=device, max_region=max_region)
    train_loader = DataLoader(train_set, batch_size=batch_size, drop_last=False,
                              collate_fn=train_set.collate_func)

    vocab = ju.load(VOCAB_URL)
    n_words = len(vocab)
    char_vocab = ju.load(VOCAB_URL.replace('vocab', 'char_vocab'))
    n_chars = len(char_vocab)

    model = ExhaustiveModel(
        hidden_size=200,
        n_tags=train_set.n_tags + 1,
        char_feat_dim=char_feat_dim,
        embedding_url=embedding_url,
        bidirectional=True,
        max_region=max_region,
        n_embeddings=n_words,
        n_chars=n_chars,
        embedding_dim=200,
        freeze=freeze
    )

    if device.type == 'cuda':
        print("using gpu,", torch.cuda.device_count(), "gpu(s) available!\n")
        # model = nn.DataParallel(model)
    else:
        print("using cpu\n")
    model = model.to(device)

    criterion = F.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    max_f1, max_f1_epoch, cnt = 0, 0, 0
    # ignore the padding part when calculating loss
    tag_weights = torch.Tensor([1] * train_set.n_tags + [0]).to(device)
    best_model_url = None

    # train and evaluate model
    for epoch in range(n_epochs):
        # switch to train mode
        model.train()
        batch_id = 0
        for data, labels, _ in train_loader:
            optimizer.zero_grad()
            outputs = model.forward(*data)
            # use weight parameter to skip padding part
            loss = criterion(outputs, labels, weight=tag_weights)
            loss.backward()
            # gradient clipping
            if clip_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()

            endl = '\n' if batch_id % LOG_PER_BATCH == 0 else '\r'
            sys.stdout.write("epoch #%d, batch #%d, loss: %.6f, %s%s" %
                             (epoch, batch_id, loss.item(), datetime.now().strftime("%X"), endl))
            sys.stdout.flush()
            batch_id += 1

        cnt += 1
        # metrics on development set
        dev_metrics = evaluate(model, dev_url)
        if dev_metrics['f1'] > max_f1:
            max_f1 = dev_metrics['f1']
            max_f1_epoch = epoch
            if save_only_best and best_model_url:
                os.remove(best_model_url)
            best_model_url = from_project_root(
                "data/model/exhaustive_model_epoch%d_%f.pt" % (epoch, max_f1))
            torch.save(model, best_model_url)
            cnt = 0

        print("maximum of f1 value: %.6f, in epoch #%d\n" % (max_f1, max_f1_epoch))
        if cnt >= early_stop > 0:
            break
        print('\n')

    if test_url and best_model_url:
        model = torch.load(best_model_url)
        print("best model url:", best_model_url)
        print("evaluating on test dataset:", test_url)
        evaluate(model, test_url)

    print(arguments)
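# Hedged usage sketch for train(): the values below are illustrative and the
# commented embedding path is an assumption, not a file shipped with the project.
def _example_train():
    train(n_epochs=30,
          embedding_url=None,   # or a path to a pretrained embedding file (assumed)
          char_feat_dim=50,
          freeze=False,
          max_region=10,
          batch_size=100,
          device='auto')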
def gen_sentence_tensors(sentence_list, device, data_url):
    """ generate input tensors from sentence list

    Args:
        sentence_list: list of raw sentences
        device: torch device
        data_url: data_url used to locate vocab files

    Returns:
        sentences: tensor
        sentence_lengths: list of int
        sentence_words: list of tensor
        sentence_word_lengths: list of tensor
        sentence_word_indices: list of tensor

    """
    vocab = ju.load(dirname(data_url) + '/vocab.json')
    char_vocab = ju.load(dirname(data_url) + '/char_vocab.json')

    sentences = list()
    sentence_words = list()
    sentence_word_lengths = list()
    sentence_word_indices = list()

    unk_idx = 1
    for sent in sentence_list:
        # word to word id
        sentence = torch.LongTensor([vocab[word] if word in vocab else unk_idx
                                     for word in sent]).to(device)

        # char of word to char id
        words = list()
        for word in sent:
            words.append([char_vocab[ch] if ch in char_vocab else unk_idx
                          for ch in word])

        # save word lengths
        word_lengths = torch.LongTensor([len(word) for word in words]).to(device)

        # sort lengths in descending order
        word_lengths, word_indices = torch.sort(word_lengths, descending=True)

        # sort words according to word length
        words = np.array(words)[word_indices.cpu().numpy()]
        word_indices = word_indices.to(device)
        words = [torch.LongTensor(word).to(device) for word in words]

        # pad char tensor of words
        words = pad_sequence(words, batch_first=True).to(device)
        # (sent_len, max_word_len)

        sentences.append(sentence)
        sentence_words.append(words)
        sentence_word_lengths.append(word_lengths)
        sentence_word_indices.append(word_indices)

    # record sentence lengths and pad sentences
    sentence_lengths = [len(sentence) for sentence in sentences]
    # (batch_size)
    sentences = pad_sequence(sentences, batch_first=True).to(device)
    # (batch_size, max_sent_len)

    return sentences, sentence_lengths, sentence_words, sentence_word_lengths, sentence_word_indices
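# Hedged usage sketch for gen_sentence_tensors: it assumes vocab.json and
# char_vocab.json sit next to data_url and map tokens/characters to integer ids,
# with id 1 reserved for unknowns as in the lookup above. The data path and the
# example batch are illustrative only.
def _example_gen_sentence_tensors():
    device = torch.device('cpu')
    data_url = from_project_root("data/train.txt")  # assumed path
    batch = [["IL-2", "gene", "expression"], ["NF-kappa", "B"]]
    (sentences, sentence_lengths, sentence_words,
     sentence_word_lengths, sentence_word_indices) = gen_sentence_tensors(batch, device, data_url)
    print(sentences.shape)    # (batch_size, max_sent_len)
    print(sentence_lengths)   # [3, 2]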
# coding: utf-8
# created by deng on 7/27/2018

from utils.path_util import from_project_root
import utils.json_util as ju

DATA_URL = from_project_root("processed_data/phrase_level_data_train.csv")
BDC_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_bdc.json"))
DC_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_dc.json"))
TF_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_tf.json"))


def filtered_by_dict(word, dic=BDC_DICT, lower=5., upper=1.e5):
    """ filter a word according to its weight in dic

    Args:
        word: the word to check
        dic: the dict to use
        lower: lower bound
        upper: upper bound

    Returns:
        bool: True if the word should be filtered

    """
    return dic[word] < lower or dic[word] > upper


def process_data(data_url=DATA_URL, save_url=None):
    """ process data according to specific rules
def pre_processed_sen(bdc_pickle, tf_pickle, dc_pickle, train_file,
                      processed_data_file, limit_word=400):
    """
    :param bdc_pickle: globally computed bdc weights (currently unused, kept for compatibility)
    :param tf_pickle: globally computed tf weights
    :param dc_pickle: globally computed dc weights
    :param train_file: word/phrase level data file, expected to be lightly denoised already
    :param processed_data_file: path to write the preprocessed documents
    :param limit_word: max number of distinct words to keep per document (threshold)
    :return:
    """
    # load bdc values
    # bdc_dict = ju.load(bdc_pickle)
    # load tf values
    tf_dict = ju.load(tf_pickle)
    # load dc values
    dc_dict = ju.load(dc_pickle)

    line_count = 0
    # read the training documents
    with open(train_file, 'r', encoding='utf-8') as f, \
            open(processed_data_file, 'w', encoding='utf-8') as wf:
        for line in f.readlines():
            print("filtered_line={}".format(line_count))
            line_count += 1
            line_list = line.strip().split(',')
            # words kept after preprocessing
            processed_word_list = []
            label = line_list[0]
            word_list = line_list[1].strip().split()

            # drop words with extreme term frequency
            filtered_word_list = []
            for word in word_list:
                if tf_dict[word] <= 2 or tf_dict[word] > 7500:
                    continue
                filtered_word_list.append(word)

            # sentence length, used for normalization so that document length
            # does not dominate the sentence-level weights
            sen_len = len(filtered_word_list)

            # sentence-level tf
            word_dict = collections.defaultdict(float)
            for word in filtered_word_list:
                word_dict[word] += 1.0

            # normalize and compute the tf * dc weight
            for (word, tf_value) in word_dict.items():
                word_dict[word] = word_dict[word] / sen_len * dc_dict[word]

            # sort words by weight in descending order
            sorted_word_tuple = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)

            if len(sorted_word_tuple) < limit_word:
                # below the threshold, no compression needed
                processed_word_list = filtered_word_list
                wf.write("{},{}\n".format(label, ' '.join(processed_word_list)))
                continue

            # keep only the top limit_word words
            keep_words = []
            for (word, tf_bdc_value) in sorted_word_tuple[:limit_word]:
                keep_words.append(word)

            # preserve the original word order while dropping discarded words
            for word in filtered_word_list:
                if word in keep_words:
                    processed_word_list.append(word)

            wf.write("{},{}\n".format(label, ' '.join(processed_word_list)))
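# Hedged usage sketch for pre_processed_sen: the weight files are assumed to be the
# json dicts saved by calc_tf / calc_bdc / calc_dc above, and the output file name
# is illustrative. bdc_pickle is accepted but unused by the current implementation.
def _example_pre_processed_sen():
    pre_processed_sen(
        bdc_pickle=from_project_root("processed_data/saved_weight/phrase_level_1gram_bdc.json"),
        tf_pickle=from_project_root("processed_data/saved_weight/phrase_level_1gram_tf.json"),
        dc_pickle=from_project_root("processed_data/saved_weight/phrase_level_1gram_dc.json"),
        train_file=from_project_root("processed_data/phrase_level_data_train.csv"),
        processed_data_file=from_project_root("processed_data/phrase_level_data_train_filtered.csv"),  # assumed
        limit_word=400)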