def triplet2content_fn(triplets_url, entity2contents_url, save_url):
    """ map each triplet to the content of its head entity

    :param triplets_url: url to the space-separated triplet file
    :param entity2contents_url: url to the entity -> content json file
    :param save_url: url to save the generated triplet -> content json file
    :return:
    """
    entity2content = json_util.load(entity2contents_url)
    triplet2content = {}
    with open(triplets_url, 'r', encoding="utf-8") as triplet_file:
        for line in triplet_file:
            # each line is "entity1 entity2 relation"
            triplet = line.strip().split(' ')
            entity1 = triplet[0]
            entity2 = triplet[1]
            relation = triplet[2]
            # keep the triplet only if both entities occur in entity1's content
            if entity1 in entity2content[entity1] and entity2 in entity2content[entity1]:
                key = entity1 + "#" + relation + "#" + entity2
                if key not in triplet2content:
                    triplet2content[key] = entity2content[entity1]
    json_util.dump(triplet2content, save_url)
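
# Usage sketch for triplet2content_fn (not part of the original code; the paths
# and the example triplet are hypothetical). Each triplet line is read as
# "entity1 entity2 relation", so a line "Newton mechanics field_of" produces the
# key "Newton#field_of#mechanics" mapped to entity2content["Newton"], provided
# both entity strings occur in that content.
# triplet2content_fn("data/triplets.txt",
#                    "data/entity2contents.json",
#                    "data/triplet2content.json")
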
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: recalculate the dict even if it already exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1

    # sort by frequency, cache and return the result
    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
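
# Usage sketch for calc_tf (not part of the original code). The sorted counts
# are cached under processed_data/saved_weight/ and returned as {word: tf_value}.
# tf_dict = calc_tf(ngram=1)        # unigram term frequencies
# tf_dict = calc_tf(update=True)    # recount even if the cached json exists
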
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: recalculate the dict even if it already exists
        ngram: max_n for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # to store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):
        # calc dc
        arr = np.array(list(word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # calc bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
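
# Sanity-check sketch for calc_bdc (not part of the original code). The loop
# above computes bdc(t) = 1 + sum_i(G_i * log(G_i)) / log(|C|), where
# G_i = p(t, c_i) / sum_j p(t, c_j) and |C| is the number of labels, so a term
# spread evenly over all classes scores 0 and a term confined to a single class
# scores 1. The numbers below are hypothetical and only illustrate those extremes.
def _demo_bdc_extremes():
    num_labels = 2
    g_even = np.array([0.5, 0.5])  # term distributed evenly over both classes
    g_single = np.array([1.0])     # term occurring in only one class
    bdc_even = 1 + (np.log(g_even) * g_even).sum() / np.log(num_labels)
    bdc_single = 1 + (np.log(g_single) * g_single).sum() / np.log(num_labels)
    print(bdc_even, bdc_single)  # -> 0.0 1.0
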
def keepEntityAndSentences(init_urls, save_url):
    """
    :param init_urls: paths of the original source files
    :param save_url: path of the processed output file
    :return:
    """
    json_data = {}
    for init_url in init_urls:
        with open(init_url, 'r', encoding='utf-8') as file:
            for line in file:
                # tab-separated row: keep column 0 (entity) -> column 2 (sentences)
                data_units = line.split('\t')
                if data_units[0] not in json_data:
                    json_data[data_units[0]] = data_units[2]
    json_util.dump(json_data, save_url)
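
# Usage sketch for keepEntityAndSentences (not part of the original code; the
# file paths are hypothetical). Each input row is expected to be tab-separated
# with the entity in column 0 and its sentences in column 2.
# keepEntityAndSentences(["data/raw_part1.txt", "data/raw_part2.txt"],
#                        "data/entity2sentences.json")
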
def human_add_question(question_url, save_url):
    """
    :param question_url: url to the '#'-separated question file
    :param save_url: url to save the generated question json file
    :return:
    """
    human_questions_json = {}
    with open(question_url, 'r', encoding="utf-8") as question_data:
        for line in question_data:
            items = line.strip().split("#")
            if items[0] not in human_questions_json:
                human_questions_json[items[0]] = []
            ques = {}
            ques["question"] = items[1]
            ques["correct"] = items[-1]
            ques["items"] = []
            keys = ["A", "B", "C", "D"]
            for i in range(len(keys)):
                ques["items"].append("{}: {}".format(keys[i], items[2 + i]))
            human_questions_json[items[0]].append(ques)
    json_util.dump(human_questions_json, save_url)
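
# Expected input format for human_add_question (inferred from the parsing above;
# the concrete values are hypothetical). Each line is '#'-separated as
#   entity#question#optionA#optionB#optionC#optionD#correct
# e.g. "Newton#Who proposed the laws of motion?#Newton#Einstein#Bohr#Darwin#A"
# becomes:
#   {"Newton": [{"question": "Who proposed the laws of motion?", "correct": "A",
#                "items": ["A: Newton", "B: Einstein", "C: Bohr", "D: Darwin"]}]}
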
def gen_vocab_from_data(data_urls):
    """ generate the char vocab from the train, dev, test and aug.txt files

    :param data_urls: urls to train, dev, test and aug.txt
    :return:
    """
    # create the char_vocab.json file
    char_vocab_url = join("./data/running", "char_vocab.json")
    if os.path.exists(char_vocab_url):
        return

    char_vocab = set()
    print("generating char vocab from {}".format(" ".join(data_urls)))
    for data_url in data_urls:
        with open(data_url, 'r', encoding='utf-8') as f:
            for row in f:
                if row != "\n":
                    _, dna = row.strip("\n").split("\t")
                    char_vocab = char_vocab.union(dna)

    # sort the vocab in alphabetical order
    char_vocab = sorted(char_vocab)
    char_vocab = ['<pad>'] + char_vocab
    ju.dump(ju.list_to_dict(char_vocab), char_vocab_url)
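
# Usage sketch for the char-vocab gen_vocab_from_data above (not part of the
# original code; the file paths are hypothetical). Each non-empty row is
# expected to be "label\tdna", and only the dna characters go into
# char_vocab.json under ./data/running.
# gen_vocab_from_data(["./data/running/train.txt",
#                      "./data/running/dev.txt",
#                      "./data/running/test.txt",
#                      "./data/running/aug.txt"])
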
def transfer2json(triplet2content_file):
    triplet2content = json_util.load(triplet2content_file)

    # build nodes.json
    names = []
    nodes = []
    links = []
    for triplet in triplet2content.keys():
        # triplet keys have the form "entity1#relation#entity2"
        units = triplet.split("#")
        if units[0] not in names:
            names.append(units[0])
        if units[2] not in names:
            names.append(units[2])
    for name in names:
        node = {}
        node['category'] = 0
        node['name'] = name
        nodes.append(node)

    # build links, keeping each triplet unique
    triplets = []
    for triplet in triplet2content.keys():
        if triplet not in triplets:
            triplets.append(triplet)
    for triplet in triplets:
        units = triplet.split("#")
        link = {}
        link["source"] = units[0]
        link["target"] = units[2]
        link["name"] = units[1]
        links.append(link)

    # save files
    json_util.dump(nodes, from_project_root("data/analogyKG_nodes.json"))
    json_util.dump(links, from_project_root("data/analogyKG_links.json"))
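
# Output-shape sketch for transfer2json (inferred from the code above; the
# entity and relation names are hypothetical).
def _demo_transfer2json_shapes():
    # one entry of data/analogyKG_nodes.json
    node = {"category": 0, "name": "Newton"}
    # one entry of data/analogyKG_links.json, built from a key like
    # "Newton#field_of#mechanics"
    link = {"source": "Newton", "target": "mechanics", "name": "field_of"}
    return node, link
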
def gen_vocab_from_data(data_urls, pretrained_url, binary=True, update=False, min_count=1):
    """ generate vocabulary and embeddings from data file, generated vocab files will be saved in data dir

    Args:
        data_urls: url to data file(s), list or string
        pretrained_url: url to pretrained embedding file
        binary: binary flag for loading the word2vec file
        update: force update even if vocab file exists
        min_count: minimum count of a word

    Returns:
        generated word embedding url

    """
    if isinstance(data_urls, str):
        data_urls = [data_urls]
    data_dir = os.path.dirname(data_urls[0])
    vocab_url = os.path.join(data_dir, "vocab.json")
    char_vocab_url = os.path.join(data_dir, "char_vocab.json")
    embedding_url = os.path.join(data_dir, "embeddings.npy") if pretrained_url else None

    if (not update) and os.path.exists(vocab_url):
        print("vocab file already exists")
        return embedding_url

    vocab = set()
    char_vocab = set()
    word_counts = defaultdict(int)
    print("generating vocab from", data_urls)
    for data_url in data_urls:
        with open(data_url, 'r', encoding='utf-8') as data_file:
            for row in data_file:
                if row == '\n':
                    continue
                token = row.split()[0]
                word_counts[token] += 1
                if word_counts[token] > min_count:
                    vocab.add(token)
                    char_vocab = char_vocab.union(token)

    # sort vocab in alphabetical order
    vocab = sorted(vocab)
    char_vocab = sorted(char_vocab)

    # generate word embeddings for vocab
    if pretrained_url is not None:
        print("generating pre-trained embedding from", pretrained_url)
        kvs = KeyedVectors.load_word2vec_format(pretrained_url, binary=binary)
        embeddings = list()
        for word in vocab:
            if word in kvs:
                embeddings.append(kvs[word])
            else:
                embeddings.append(np.random.uniform(-0.25, 0.25, kvs.vector_size))

    char_vocab = ['<pad>', '<unk>'] + char_vocab
    vocab = ['<pad>', '<unk>'] + vocab
    ju.dump(ju.list_to_dict(vocab), vocab_url)
    ju.dump(ju.list_to_dict(char_vocab), char_vocab_url)

    if pretrained_url is None:
        return

    embeddings = np.vstack([
        np.zeros(kvs.vector_size),  # for <pad>
        np.random.uniform(-0.25, 0.25, kvs.vector_size),  # for <unk>
        embeddings
    ])
    np.save(embedding_url, embeddings)
    return embedding_url
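
# Usage sketch for the embedding-aware gen_vocab_from_data above (not part of
# the original code; the paths are hypothetical). Each non-empty row is expected
# to start with a token (first whitespace-separated field). With a pretrained
# word2vec file it writes vocab.json, char_vocab.json and embeddings.npy next to
# the data file and returns the embedding url; with pretrained_url=None it only
# writes the two vocab files.
# embedding_url = gen_vocab_from_data("data/train.txt",
#                                     "data/word2vec_pretrained.bin",
#                                     binary=True, min_count=1)
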