def triplet2content_fn(triplets_url, entity2contents_url, save_url):
    """
    :param triplets_url:
    :param entity2contents_url:
    :param data_json_url:
    :return:
    """
    entity2content = json_util.load(entity2contents_url)
    triplet2content = {}
    with open(triplets_url, 'r', encoding="utf-8") as triplet_file:
        for line in triplet_file:
            # each line: "entity1 entity2 relation", separated by spaces
            triplet = line.strip().split(' ')
            entity1 = triplet[0]
            entity2 = triplet[1]
            relation = triplet[2]
            if entity1 not in entity2content:  # skip heads without any content
                continue
            # keep the triplet only if both entity names appear in entity1's content text
            if entity1 in entity2content[entity1] and entity2 in entity2content[entity1]:
                key = entity1 + "#" + relation + "#" + entity2
                if key not in triplet2content:
                    triplet2content[key] = entity2content[entity1]

    json_util.dump(triplet2content, save_url)
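
A minimal usage sketch; the file paths are placeholders and the "entity1 entity2 relation" column order is inferred from the indexing above:

# Hypothetical paths, for illustration only.
triplet2content_fn(
    triplets_url="data/triplets.txt",
    entity2contents_url="data/entity2contents.json",
    save_url="data/triplet2content.json",
)
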
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: update dict even it exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(
            level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1

    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
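
A hedged usage sketch; DATA_URL, from_project_root and load_raw_data come from the surrounding project, so only the call pattern is shown:

# Count unigram frequencies for the default data file (cached under processed_data/saved_weight/).
tf_dict = calc_tf()
# Recompute bigram counts even if a cached JSON already exists.
tf_dict = calc_tf(update=True, ngram=2)
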
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: update dict even it exists
        ngram: maxn for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(
            level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(
            level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store all f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):

        # calculate dc
        arr = np.array(list(
            word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # calculate bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[
                label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(
            word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
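
As a sanity check on the weighting above, here is a standalone toy computation of the same bdc formula (illustration only, not project code); an evenly distributed token scores 0 and a token confined to a single class scores 1:

import numpy as np

def toy_bdc(f_t_c, f_c, n_classes):
    """f_t_c: counts of a token in the classes where it occurs,
    f_c: total token counts of those same classes."""
    p = np.array(f_t_c, dtype=float) / np.array(f_c, dtype=float)  # p(t, c_i) = f(t, c_i) / f(c_i)
    p = p / p.sum()                                                # normalize over classes
    return 1 + (p * np.log(p)).sum() / np.log(n_classes)

print(toy_bdc([5, 5], [100, 100], n_classes=2))  # 0.0 -> token spread evenly over two classes
print(toy_bdc([10], [100], n_classes=2))         # 1.0 -> token confined to one class
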
def keepEntityAndSentences(init_urls, save_url):
    """
    :param init_urls: 原来目标文件路径
    :param save_url: 处理完的文件路径
    :return:
    """
    json_data = {}
    for init_url in init_urls:
        file = open(init_url, 'r', encoding='utf-8')
        for line in file:
            data_units = line.split('\t')
            if data_units[0] not in json_data.keys():
                json_data[data_units[0]] = data_units[2]
    json_util.dump(json_data, save_url)
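
A hedged usage sketch; the tab-separated layout (entity in column 1, sentence in column 3) is inferred from the indexing above, and the paths are placeholders:

keepEntityAndSentences(
    init_urls=["data/raw_part1.txt", "data/raw_part2.txt"],  # hypothetical input files
    save_url="data/entity2sentence.json",                    # hypothetical output file
)
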
def human_add_question(question_url, save_url):
    """
    :param question_url:
    :param entity:
    :return:
    """
    human_questions_json = {}
    question_data = open(question_url, 'r', encoding="utf-8")
    for line in question_data:
        items = line.strip().split("#")
        if items[0] not in human_questions_json.keys():
            human_questions_json[items[0]] = []
        ques = {}
        ques["question"] = items[1]
        ques["correct"] = items[-1]
        ques["items"] = []
        keys = ["A", "B", "C", "D"]
        for i in range(len(keys)):
            ques["items"].append("{}: {}".format(keys[i], items[2 + i]))
        human_questions_json[items[0]].append(ques)
    json_util.dump(human_questions_json, save_url)
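
The line layout is inferred from the indexing above ('#'-separated: entity, question, four options, correct answer last); a hedged sketch of one input line and the call:

# Hypothetical input line in the question file:
#   Water#What is the chemical formula of water?#H2O#CO2#O2#NaCl#A
# It becomes one entry under the "Water" key:
#   {"question": "What is the chemical formula of water?", "correct": "A",
#    "items": ["A: H2O", "B: CO2", "C: O2", "D: NaCl"]}
human_add_question("data/human_questions.txt", "data/human_questions.json")  # placeholder paths
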
Example #6
def gen_vocab_from_data(data_urls):
    """
    given the train, dev, test and aug.txt gen the char vocab
    :param data_urls: train, dev, test and aug.txt
    :return:
    """
    # creat the char_vocab.json file
    char_vocab_url = join("./data/running", "char_vocab.json")
    if os.path.exists(char_vocab_url):
        return
    char_vocab = set()
    print("generating char vocab from {}".format(" ".join(data_urls)))
    for data_url in data_urls:
        with open(data_url, 'r', encoding='utf-8') as f:
            for row in f:
                if row != "\n":
                    _, dna = row.strip("\n").split("\t")
                    char_vocab = char_vocab.union(dna)
    # sort the vocab in alphabetical order
    char_vocab = sorted(char_vocab)
    char_vocab = ['<pad>'] + char_vocab
    ju.dump(ju.list_to_dict(char_vocab), char_vocab_url)
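
A hedged usage sketch; the "<label>\t<sequence>" row layout is inferred from the split above, and the file names are placeholders:

# Build ./data/running/char_vocab.json once from all splits (skipped if the file already exists).
gen_vocab_from_data([
    "./data/running/train.txt",  # hypothetical file names
    "./data/running/dev.txt",
    "./data/running/test.txt",
    "./data/running/aug.txt",
])
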
Example #7
def transfer2json(triplet2content_file):

    triplet2content = json_util.load(triplet2content_file)
    """
        构造nodes.json
    """
    names = []
    nodes = []
    links = []
    for triplet in triplet2content.keys():
        units = triplet.split("#")
        if units[0] not in names:
            names.append(units[0])
        if units[2] not in names:
            names.append(units[2])
    for name in names:
        node = {}
        node['category'] = 0
        node['name'] = name
        nodes.append(node)
    """
        构造links  
    """
    # 保证tripelt唯一性
    triplets = []
    for triplet in triplet2content.keys():
        if triplet not in triplets:
            triplets.append(triplet)
    for triplet in triplets:
        units = triplet.split("#")
        link = {}
        link["source"] = units[0]
        link["target"] = units[2]
        link["name"] = units[1]
        links.append(link)

    # save file
    json_util.dump(nodes, from_project_root("data/analogyKG_nodes.json"))
    json_util.dump(links, from_project_root("data/analogyKG_links.json"))
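
For reference, a sketch of the JSON shapes this function writes, derived from the loops above (entity and relation names are placeholders):

# analogyKG_nodes.json: one entry per unique entity name
#   [{"category": 0, "name": "entityA"}, {"category": 0, "name": "entityB"}, ...]
# analogyKG_links.json: one entry per "entity1#relation#entity2" key
#   [{"source": "entityA", "target": "entityB", "name": "relation"}, ...]
transfer2json(from_project_root("data/triplet2content.json"))  # hypothetical input path
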
Example #8
File: dataset.py Project: csJd/CRAM
def gen_vocab_from_data(data_urls,
                        pretrained_url,
                        binary=True,
                        update=False,
                        min_count=1):
    """ generate vocabulary and embeddings from data file, generated vocab files will be saved in
        data dir

    Args:
        data_urls: url to data file(s), list or string
        pretrained_url: url to pretrained embedding file
        binary: whether the pretrained word2vec file is in binary format
        update: force an update even if the vocab file already exists
        min_count: minimum count of a word

    Returns:
        generated word embedding url
    """

    if isinstance(data_urls, str):
        data_urls = [data_urls]
    data_dir = os.path.dirname(data_urls[0])
    vocab_url = os.path.join(data_dir, "vocab.json")
    char_vocab_url = os.path.join(data_dir, "char_vocab.json")
    embedding_url = os.path.join(data_dir,
                                 "embeddings.npy") if pretrained_url else None

    if (not update) and os.path.exists(vocab_url):
        print("vocab file already exists")
        return embedding_url

    vocab = set()
    char_vocab = set()
    word_counts = defaultdict(int)
    print("generating vocab from", data_urls)
    for data_url in data_urls:
        with open(data_url, 'r', encoding='utf-8') as data_file:
            for row in data_file:
                if row == '\n':
                    continue
                token = row.split()[0]
                word_counts[token] += 1
                # keep tokens that appear at least min_count times
                if word_counts[token] >= min_count:
                    vocab.add(token)
                char_vocab = char_vocab.union(token)

    # sort both vocabs in alphabetical order
    vocab = sorted(vocab)
    char_vocab = sorted(char_vocab)

    # generate word embeddings for vocab
    if pretrained_url is not None:
        print("generating pre-trained embedding from", pretrained_url)
        kvs = KeyedVectors.load_word2vec_format(pretrained_url, binary=binary)
        embeddings = list()
        for word in vocab:
            if word in kvs:
                embeddings.append(kvs[word])
            else:
                embeddings.append(
                    np.random.uniform(-0.25, 0.25, kvs.vector_size))

    char_vocab = ['<pad>', '<unk>'] + char_vocab
    vocab = ['<pad>', '<unk>'] + vocab
    ju.dump(ju.list_to_dict(vocab), vocab_url)
    ju.dump(ju.list_to_dict(char_vocab), char_vocab_url)

    if pretrained_url is None:
        return

    embeddings = np.vstack([
        np.zeros(kvs.vector_size),  # for <pad>
        np.random.uniform(-0.25, 0.25, kvs.vector_size),  # for <unk>
        embeddings
    ])
    np.save(embedding_url, embeddings)
    return embedding_url
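
A hedged usage sketch for the version above; the one-token-per-row input format is inferred from row.split()[0], and both the data and embedding paths are placeholders:

embedding_url = gen_vocab_from_data(
    data_urls=["data/train.txt", "data/dev.txt"],   # hypothetical data files
    pretrained_url="data/wordvec/pretrained.bin",   # hypothetical word2vec file
    binary=True,
    min_count=1,
)
embeddings = np.load(embedding_url)  # rows align with the indices in the saved vocab.json
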