Example no. 1
def get_term_weight(x_text_arr, max_doc_len, term_weight_file):
    """ compute a tf * dc weight for every token position of each document,
    then min-max rescale each document's weights into [1, 3] """
    dc_dict = ju.load(term_weight_file)
    term_weights = []
    for x in x_text_arr:
        x_word_list = x.strip().split()
        sen_length = len(x_word_list)
        # compute the document-level term frequency (tf)
        tf_dict = collections.defaultdict(int)
        for word in x_word_list:
            tf_dict[word] += 1
        term_weight = [0] * max_doc_len
        for i in range(min(max_doc_len, len(x_word_list))):
            term_weight[i] = tf_dict[x_word_list[i]] / sen_length * dc_dict[
                x_word_list[i]]
        # min-max normalize the weights
        term_weight = np.array(term_weight)
        max_value = term_weight.max()
        min_value = term_weight.min()
        mid_value = max_value - min_value
        if mid_value == 0:  # add one so the weights never collapse to all zeros
            term_weight = [1 for value in term_weight]
        else:
            term_weight = [((value - min_value) / mid_value) * 2 + 1
                           for value in term_weight]

        term_weights.append(term_weight)

    return term_weights
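
For reference, a minimal self-contained sketch of the min-max rescaling applied per document above; the raw tf * dc scores below are made up, and the last two positions stand for padding beyond the sentence length.
import numpy as np

raw = np.array([0.30, 0.10, 0.20, 0.10, 0.0, 0.0])  # made-up tf * dc scores, max_doc_len = 6
lo, hi = raw.min(), raw.max()
scaled = (raw - lo) / (hi - lo) * 2 + 1  # the same rescale used in get_term_weight
print(scaled)  # all values lie in [1, 3]; padded zeros map to 1.0, the maximum to 3.0
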
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: update the dict even if it already exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(
            level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1

    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
def main():

    json_url = from_project_root("processed_data/entity2contents.json")
    json_data = json_util.load(json_url)

    print(json_data["红楼梦"])
    exit()
def triplet2content_fn(triplets_url, entity2contents_url, save_url):
    """
    :param triplets_url:
    :param entity2contents_url:
    :param data_json_url:
    :return:
    """
    entity2content = json_util.load(entity2contents_url)
    triplet2content = {}
    with open(triplets_url, 'r', encoding="utf-8") as triplet_file:
        for line in triplet_file:
            triplet = line.strip().split(' ')
            entity1 = triplet[0]
            relation = triplet[2]
            entity2 = triplet[1]
            # print(entity1)
            # print(entity2)
            # print(entity2content[entity1])
            # exit()
            if entity1 in entity2content and entity1 in entity2content[entity1] \
                    and entity2 in entity2content[entity1]:
                key = entity1 + "#" + relation + "#" + entity2
                if key not in triplet2content:
                    triplet2content[key] = entity2content[entity1]

    json_util.dump(triplet2content, save_url)
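
A hedged sketch of how the function is driven; the space-separated line layout (entity1 entity2 relation) is inferred from the indexing above, and all file names here are placeholders rather than paths from the project.
# Illustrative invocation (placeholder paths). A line in the triplet file such as
#     红楼梦 曹雪芹 作者
# is read as entity1="红楼梦", entity2="曹雪芹", relation="作者"; if both entities
# occur in entity2content["红楼梦"], the key "红楼梦#作者#曹雪芹" is mapped to that
# entity's content string in the saved JSON.
triplet2content_fn("processed_data/triplets.txt",
                   "processed_data/entity2contents.json",
                   "processed_data/triplet2contents.json")
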
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: update the dict even if it already exists
        ngram: max_n for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(
            level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(
            level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store all f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):

        # for calc dc
        arr = np.array(list(
            word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # for calc bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[
                label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(
            word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
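
As a sanity check on the formulas above, a small self-contained example of the entropy-based dc / bdc values for one token, using made-up counts over three classes.
import numpy as np

f_t_c = np.array([8.0, 1.0, 1.0])      # made-up f(t, c_i): token count in each class
f_c = np.array([100.0, 100.0, 400.0])  # made-up f(c_i): total token count of each class

p = f_t_c / f_t_c.sum()                               # f(t, c_i) / f(t)
dc = 1 + (p * np.log(p)).sum() / np.log(len(f_t_c))   # dc, as in calc_bdc

q = f_t_c / f_c                                       # p(t, c_i) = f(t, c_i) / f(c_i)
q = q / q.sum()                                       # normalize over classes
bdc = 1 + (q * np.log(q)).sum() / np.log(len(f_t_c))  # bdc, as in calc_bdc

print(round(dc, 3), round(bdc, 3))  # both lie in [0, 1]; higher means more class-specific
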
def calc_dc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the dc value of all tokens

    Args:
        data_url: url to data file
        update: update the dict even if it already exists
        ngram: max_n for ngram

    Returns:
        dict: dc dict {word: dc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(
            level, ngram))
    if not update and exists(dc_url):
        return ju.load(dc_url)
    calc_bdc(data_url, update=True, ngram=ngram)
    return ju.load(dc_url)
def main(entity):

    # load data
    triplet2content_url = from_project_root(
        "processed_data/triplet2contents.csv")
    triplet2content = json_util.load(triplet2content_url)

    # print(triplet2content.keys())
    # print(triplet2content["朱自清#职业#诗人"])
    # exit()
    # triple
    # triplet = "水浒传#创作年代#元末明初"
    # questions = generatorBytriplet(triplet2content, triplet)
    # print(questions)
    questions = []
    # entity = "红楼梦"
    model_questions = generatorByEntity(triplet2content, entity)
    question_data_file = from_project_root("processed_data/humanQuestion.json")
    human_questions = json_util.load(question_data_file)
    if entity in human_questions.keys():
        questions.extend(human_questions[entity])
    questions.extend(model_questions)
    print(questions)
    return questions
Example no. 8
def load_raw(data_url):
    save_url = data_url.replace('.dat', '.raw.pkl')
    if os.path.exists(save_url):
        return joblib.load(save_url)
    char_vocab = ju.load(join("./data/running", "char_vocab.json"))
    labels, txts = [], []
    with open(data_url, "r") as f:
        for line in f:
            line_split = line.strip("\n").split("\t")
            labels.append(line_split[1])
            char_index = []
            for char in line_split[0]:
                char_index.append(char_vocab[char])
            txts.append(char_index)
    # cache the parsed result so later calls can load it from save_url directly
    joblib.dump((txts, labels), save_url)
    return txts, labels
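
A hedged usage sketch; the tab-separated "text<TAB>label" line layout is inferred from the indexing above, and the .dat path is a placeholder.
# Each line of the .dat file is assumed to be "<text>\t<label>", e.g.
#     这本书的作者是谁	1
# Characters are mapped to ids via char_vocab.json, so for a one-line file
# load_raw returns something like ([[17, 5, 42, ...]], ["1"]).
txts, labels = load_raw("./data/running/sample.dat")
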
def main():

    # load data
    triplet2content_url = from_project_root("processed_data/triplet2contents.csv")
    triplet2content = json_util.load(triplet2content_url)
    # print(triplet2content.keys())
    # exit()
    # triple
    # t = "红楼梦#作者#曹雪芹"
    # # triplet = "水浒传#创作年代#元末明初"
    # questions = generatorBytriplet(triplet2content, triplet)
    # print(questions)
    entity = "红楼梦"
    questions = generatorByEntity(triplet2content, entity)
    print(questions)
    return questions
Example no. 10
def transfer2json(triplet2content_file):

    triplet2content = json_util.load(triplet2content_file)
    """
        构造nodes.json
    """
    names = []
    nodes = []
    links = []
    for triplet in triplet2content.keys():
        units = triplet.split("#")
        if units[0] not in names:
            names.append(units[0])
        if units[2] not in names:
            names.append(units[2])
    for name in names:
        node = {}
        node['category'] = 0
        node['name'] = name
        nodes.append(node)
    """
        构造links  
    """
    # 保证tripelt唯一性
    triplets = []
    for triplet in triplet2content.keys():
        if triplet not in triplets:
            triplets.append(triplet)
    for triplet in triplets:
        units = triplet.split("#")
        link = {}
        link["source"] = units[0]
        link["target"] = units[2]
        link["name"] = units[1]
        links.append(link)

    # save file
    json_util.dump(nodes, from_project_root("data/analogyKG_nodes.json"))
    json_util.dump(links, from_project_root("data/analogyKG_links.json"))
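
To illustrate the shape of the two saved files, this is what a single triplet key would produce; the values shown are illustrative only.
# For the triplet key "红楼梦#作者#曹雪芹" the dumped JSON entries look like:
nodes_example = [
    {"category": 0, "name": "红楼梦"},
    {"category": 0, "name": "曹雪芹"},
]
links_example = [
    {"source": "红楼梦", "target": "曹雪芹", "name": "作者"},
]
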
Example no. 11
def train(n_epochs=30,
          embedding_url=None,
          char_feat_dim=50,
          freeze=False,
          train_url=TRAIN_URL,
          dev_url=DEV_URL,
          test_url=None,
          max_region=10,
          learning_rate=0.001,
          batch_size=100,
          early_stop=5,
          clip_norm=5,
          device='auto',
          save_only_best=True
          ):
    """ Train deep exhaustive model, Sohrab et al. 2018 EMNLP

    Args:
        n_epochs: number of epochs
        embedding_url: url to pretrained embedding file, set as None to use random embedding
        char_feat_dim: size of character level feature
        freeze: whether to freeze embedding
        train_url: url to train data
        dev_url: url to dev data
        test_url: url to test data for evaluating, set to None for not evaluating
        max_region: max entity region size
        learning_rate: learning rate
        batch_size: batch_size
        early_stop: early stop for training
        clip_norm: whether to perform norm clipping, set to 0 if not need
        device: device for torch
        save_only_best: only save model of best performance
    """

    # print arguments
    arguments = json.dumps(vars(), indent=2)
    print("exhaustive model is training with arguments", arguments)
    device = get_device(device)

    train_set = ExhaustiveDataset(train_url, device=device, max_region=max_region)
    train_loader = DataLoader(train_set, batch_size=batch_size, drop_last=False,
                              collate_fn=train_set.collate_func)

    vocab = ju.load(VOCAB_URL)
    n_words = len(vocab)
    char_vocab = ju.load(VOCAB_URL.replace('vocab', 'char_vocab'))
    n_chars = len(char_vocab)

    model = ExhaustiveModel(
        hidden_size=200,
        n_tags=train_set.n_tags + 1,
        char_feat_dim=char_feat_dim,
        embedding_url=embedding_url,
        bidirectional=True,
        max_region=max_region,
        n_embeddings=n_words,
        n_chars=n_chars,
        embedding_dim=200,
        freeze=freeze
    )

    if device.type == 'cuda':
        print("using gpu,", torch.cuda.device_count(), "gpu(s) available!\n")
        # model = nn.DataParallel(model)
    else:
        print("using cpu\n")
    model = model.to(device)

    criterion = F.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    max_f1, max_f1_epoch, cnt = 0, 0, 0
    # ignore the padding part when calculating the loss
    tag_weights = torch.Tensor([1] * train_set.n_tags + [0]).to(device)
    best_model_url = None

    # train and evaluate model
    for epoch in range(n_epochs):
        # switch to train mode
        model.train()
        batch_id = 0
        for data, labels, _ in train_loader:
            optimizer.zero_grad()
            outputs = model.forward(*data)
            # use weight parameter to skip padding part
            loss = criterion(outputs, labels, weight=tag_weights)
            loss.backward()
            # gradient clipping
            if clip_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()

            endl = '\n' if batch_id % LOG_PER_BATCH == 0 else '\r'
            sys.stdout.write("epoch #%d, batch #%d, loss: %.6f, %s%s" %
                             (epoch, batch_id, loss.item(), datetime.now().strftime("%X"), endl))
            sys.stdout.flush()
            batch_id += 1

        cnt += 1
        # metrics on development set
        dev_metrics = evaluate(model, dev_url)
        if dev_metrics['f1'] > max_f1:
            max_f1 = dev_metrics['f1']
            max_f1_epoch = epoch
            if save_only_best and best_model_url:
                os.remove(best_model_url)
            best_model_url = from_project_root(
                "data/model/exhaustive_model_epoch%d_%f.pt" % (epoch, max_f1))
            torch.save(model, best_model_url)
            cnt = 0

        print("maximum of f1 value: %.6f, in epoch #%d\n" % (max_f1, max_f1_epoch))
        if cnt >= early_stop > 0:
            break
    print('\n')

    if test_url and best_model_url:
        model = torch.load(best_model_url)
        print("best model url:", best_model_url)
        print("evaluating on test dataset:", test_url)
        evaluate(model, test_url)

    print(arguments)
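
A minimal invocation sketch; it assumes the module-level constants (TRAIN_URL, DEV_URL, VOCAB_URL) already point at prepared data, and every value below simply restates a default from the signature.
if __name__ == '__main__':
    train(
        n_epochs=30,
        embedding_url=None,  # or a path to a pretrained embedding file
        test_url=None,       # set to a test file url to evaluate the best model
        device='auto',
    )
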
Example no. 12
File: dataset.py Project: csJd/CRAM
def gen_sentence_tensors(sentence_list, device, data_url):
    """ generate input tensors from sentence list

    Args:
        sentence_list: list of raw sentence
        device: torch device
        data_url: data_url used to locate vocab files

    Returns:
        sentences, tensor
        sentence_lengths, tensor
        sentence_words, list of tensor
        sentence_word_lengths, list of tensor
        sentence_word_indices, list of tensor

    """
    vocab = ju.load(dirname(data_url) + '/vocab.json')
    char_vocab = ju.load(dirname(data_url) + '/char_vocab.json')

    sentences = list()
    sentence_words = list()
    sentence_word_lengths = list()
    sentence_word_indices = list()

    unk_idx = 1
    for sent in sentence_list:
        # word to word id
        sentence = torch.LongTensor([
            vocab[word] if word in vocab else unk_idx for word in sent
        ]).to(device)

        # char of word to char id
        words = list()
        for word in sent:
            words.append([
                char_vocab[ch] if ch in char_vocab else unk_idx for ch in word
            ])

        # save word lengths
        word_lengths = torch.LongTensor([len(word)
                                         for word in words]).to(device)

        # sort word lengths in descending order
        word_lengths, word_indices = torch.sort(word_lengths, descending=True)

        # reorder words to match the sorted word lengths
        words = [words[idx] for idx in word_indices.cpu().numpy()]
        word_indices = word_indices.to(device)
        words = [torch.LongTensor(word).to(device) for word in words]

        # pad the char id tensors of the words
        words = pad_sequence(words, batch_first=True).to(device)
        # (sent_len, max_word_len)

        sentences.append(sentence)
        sentence_words.append(words)
        sentence_word_lengths.append(word_lengths)
        sentence_word_indices.append(word_indices)

    # record sentence length and padding sentences
    sentence_lengths = [len(sentence) for sentence in sentences]
    # (batch_size)
    sentences = pad_sequence(sentences, batch_first=True).to(device)
    # (batch_size, max_sent_len)

    return sentences, sentence_lengths, sentence_words, sentence_word_lengths, sentence_word_indices
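
A usage sketch with toy sentences; the data path is a placeholder whose only role is to locate vocab.json and char_vocab.json in the same directory.
import torch

device = torch.device("cpu")
sents = [["He", "likes", "books"], ["Hi"]]
padded, lengths, words, word_lens, word_idx = gen_sentence_tensors(
    sents, device, "data/train.iob2")  # placeholder path
# padded: LongTensor of shape (2, 3); lengths: [3, 1]
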
Example no. 13
# coding: utf-8
# created by deng on 7/27/2018

from utils.path_util import from_project_root
import utils.json_util as ju

DATA_URL = from_project_root("processed_data/phrase_level_data_train.csv")

BDC_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_bdc.json"))
DC_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_dc.json"))
TF_DICT = ju.load(from_project_root("processed_data/saved_weight/phrase_level_tf.json"))


def filtered_by_dict(word, dic=BDC_DICT, lower=5., upper=1.e5):
    """ filtering words according their tf

    Args:
        word: the sentence to process
        dic: the dict to use
        lower: lower bound
        upper: upper bound

    Returns:
        bool: True if the word should be filtered

    """
    return dic[word] < lower or dic[word] > upper
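
A tiny usage sketch with an explicit toy dict passed in place of the loaded BDC_DICT; the words and values are made up.
toy_tf = {"the": 120000.0, "hobbit": 42.0, "xyzzy": 2.0}
print(filtered_by_dict("the", dic=toy_tf))     # True, above the upper bound
print(filtered_by_dict("hobbit", dic=toy_tf))  # False, kept
print(filtered_by_dict("xyzzy", dic=toy_tf))   # True, below the lower bound
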


def process_data(data_url=DATA_URL, save_url=None):
    """ process data according to specific rules """


def pre_processed_sen(bdc_pickle,
                      tf_pickle,
                      dc_pickle,
                      train_file,
                      processed_data_file,
                      limit_word=400):
    """
    :param bdc_pickle: 根据全局计算出来的bdc权重
    :param train_file: 级别上的数据文件,此文件需要初步的去噪.[]
    :param limit_word: 限定每个样本文档不重复词语的个数[阈值]
    :param processed_data_file: 预处理好的文档路径
    :return:
    """
    # load bdc values
    # bdc_dict = ju.load(bdc_pickle)
    # load tf values
    tf_dict = ju.load(tf_pickle)
    # load dc values
    dc_dict = ju.load(dc_pickle)
    line_count = 0
    # read the training documents
    with open(train_file, 'r',
              encoding='utf-8') as f, open(processed_data_file,
                                           'w',
                                           encoding='utf-8') as wf:

        for line in f:
            print("filtered_line={}".format(line_count))
            line_count += 1

            line_list = line.strip().split(',')
            # list of words kept after preprocessing
            processed_word_list = []
            # parse the document label and its word list
            label = line_list[0]
            word_list = line_list[1].strip().split()

            # filter out words with extreme term frequencies ==========================
            filtered_word_list = []
            for word in word_list:
                if tf_dict[word] <= 2 or tf_dict[word] > 7500:
                    continue
                filtered_word_list.append(word)

            sen_len = len(filtered_word_list)  # for normalization, so sentence length does not skew the sentence-level weights

            # compute the sentence-level tf ==============================
            word_dict = collections.defaultdict(float)

            for word in filtered_word_list:
                word_dict[word] += 1.0
            # normalize and compute the tf * dc value =========================
            for word in word_dict:
                word_dict[word] = word_dict[word] / sen_len * dc_dict[word]

            # sort word_dict by weight, from largest to smallest =============================
            sorted_word_tuple = sorted(word_dict.items(),
                                       key=lambda item: item[1],
                                       reverse=True)

            if len(sorted_word_tuple) < limit_word:  # below the threshold, no compression needed
                processed_word_list = filtered_word_list
                wf.write("{},{}\n".format(label,
                                          ' '.join(processed_word_list)))
                continue

            # keep the top limit_word words by weight and turn the tuples into a list =================================
            keep_words = []
            for (word, tf_bdc_value) in sorted_word_tuple[:limit_word]:
                keep_words.append(word)
            # preserve the original word order, dropping words outside keep_words
            for word in filtered_word_list:
                if word in keep_words:
                    processed_word_list.append(word)
            wf.write("{},{}\n".format(label, ' '.join(processed_word_list)))
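
A hedged invocation sketch; the weight-file names follow the naming pattern written by calc_tf / calc_bdc above, while the output path is a placeholder.
pre_processed_sen(
    bdc_pickle="processed_data/saved_weight/phrase_level_1gram_bdc.json",
    tf_pickle="processed_data/saved_weight/phrase_level_1gram_tf.json",
    dc_pickle="processed_data/saved_weight/phrase_level_1gram_dc.json",
    train_file="processed_data/phrase_level_data_train.csv",
    processed_data_file="processed_data/phrase_level_data_train_filtered.csv",
    limit_word=400,
)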