import math
import pickle

import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Project-local helpers assumed to be available in this module:
# Seq2seqCore, data_indexer, dictionary_generator.


def s2s_end2end(filename):
    pkl_name = "data/split/20180405.pkl"
    token_idx_dict, idx_token_dict, ontology_idx_dict, idx_ontology_dict, pretrained_dict, train_token, test_token, train_ontology, test_ontology = pickle.load(
        open(pkl_name, "rb"))
    s2s_core = Seq2seqCore(gpu_device=0,
                           encoder_vocab_size=len(token_idx_dict),
                           decoder_vocab_size=len(ontology_idx_dict))

    token_inputs = list()

    for one_line in open(filename):
        one_line = one_line.strip()
        one_line = one_line.lower()
        assert len(one_line.split("\t")) == 2
        one_line = one_line.split("\t")[0]
        one_line_list = word_tokenize(one_line)
        token_inputs.append(one_line_list)

    # feeder_generator (defined below) expects raw token lists together with
    # the token and pretrained dictionaries; it performs the indexing itself.
    input_dict = feeder_generator(token_inputs, token_idx_dict,
                                  pretrained_dict)

    for epoch_number in range(1, 300):
        print(epoch_number)
        s2s_core.load("models/20180329/pso/%05d.tfmodel" % epoch_number)
        predict_result = s2s_core.predict(input_dict)
        predict_list = predict_result.sample_id[:, 0:3].tolist()
        f_w = open("data/output/pso_attention_%03d.txt" % epoch_number, "w")
        for one_predict_result in predict_list:
            one_predict_tuple = data_indexer(one_predict_result,
                                             idx_ontology_dict)
            f_w.write("%s\n" % (" ".join(one_predict_tuple)))
        f_w.close()
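
# All of the functions in this file rely on a project-local data_indexer
# helper that is not shown. A minimal sketch of the assumed behaviour: map
# every element of a sequence through the given dictionary, in either
# direction (token -> id or id -> token). Any OOV handling in the real
# helper is unknown and omitted here.
def data_indexer_sketch(sequence, mapping_dict):
    return [mapping_dict[one_item] for one_item in sequence]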
Example #2
def feeder_generator(raw_inputs, token_idx_dict, pretrained_dict):
    result_dict = dict()
    max_length = 0
    for one_entry in raw_inputs:
        if len(one_entry) > max_length:
            max_length = len(one_entry)

    x_data = list()
    pretrained_x_data = list()
    x_length = list()
    y_length = list()
    for one_entry in raw_inputs:
        one_pretrained_x = list()
        for one_original_token in one_entry:
            try:
                one_pretrained_x.append(pretrained_dict[one_original_token])
            except KeyError:
                # Tokens without a pretrained vector fall back to a zero
                # embedding of the same width (200).
                one_pretrained_x.append(np.zeros([200], dtype=np.float32))
        x_length.append(len(one_entry))
        x_data.append(
            data_indexer(one_entry, token_idx_dict) + [0] *
            (max_length + 2 - len(one_entry)))
        pretrained_x_data.append(
            np.concatenate((np.array(one_pretrained_x, dtype=np.float32),
                            np.zeros([max_length + 2 - len(one_entry), 200],
                                     dtype=np.float32)),
                           axis=0))
        y_length.append(5)

    result_dict["encoder_length"] = np.array(x_length, dtype=np.int32)
    result_dict["decoder_length"] = np.array(y_length, dtype=np.int32)
    result_dict["encoder_input"] = np.array(x_data, dtype=np.int32)
    result_dict["encoder_pretrained"] = np.array(pretrained_x_data,
                                                 dtype=np.float32)

    return result_dict
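
# A hedged usage sketch for feeder_generator; the toy dictionaries below are
# made-up placeholders, only meant to show the shapes of the returned arrays.
def feeder_generator_demo():
    toy_token_idx_dict = {"the": 1, "cat": 2, "sleeps": 3}
    toy_pretrained_dict = {"cat": np.ones([200], dtype=np.float32)}
    toy_feed = feeder_generator([["the", "cat", "sleeps"], ["cat"]],
                                toy_token_idx_dict, toy_pretrained_dict)
    print(toy_feed["encoder_input"].shape)       # (2, 5): padded to max_length + 2
    print(toy_feed["encoder_pretrained"].shape)  # (2, 5, 200)
    print(toy_feed["encoder_length"])            # [3 1]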
Example #3
def s2s_split(file_name):
    # The preprocessed pickle written by s2s_preprocess contains nine entries;
    # keep pretrained_dict as well so it can be carried into the split pickle.
    (_, _, ontology_idx_dict, _, pretrained_dict, _, _, raw_sentences,
     ontology_results) = pickle.load(open(file_name, "rb"))

    test_ratio = 0.0014

    test_size = int(math.floor(len(raw_sentences) * test_ratio))

    # Keep re-sampling the split until the training portion covers every
    # ontology label, so the test split never contains unseen labels.
    get_out_flag = False
    while not get_out_flag:
        randomized_idx = np.random.permutation(len(raw_sentences))
        train_split_idx = randomized_idx[test_size:]
        train_raw_ontology = list()
        for one_idx in train_split_idx:
            train_raw_ontology.append(ontology_results[one_idx])
        ontology_voc_list = list()
        for one_ontology_result in train_raw_ontology:
            for one_ontology in one_ontology_result:
                ontology_voc_list.append(one_ontology)
        ontology_voc_list = list(set(ontology_voc_list))
        new_ontology_idx_dict, _ = dictionary_generator(ontology_voc_list,
                                                        oov_flag=False)
        print(len(new_ontology_idx_dict))
        print(len(ontology_idx_dict))
        if len(new_ontology_idx_dict) == len(ontology_idx_dict):
            get_out_flag = True

    train_raw_token = list()
    for one_idx in train_split_idx:
        train_raw_token.append(raw_sentences[one_idx])
    token_voc_list = list()
    for one_raw_sentence in train_raw_token:
        for one_token in one_raw_sentence:
            token_voc_list.append(one_token)
    token_voc_list = list(set(token_voc_list))

    token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list,
                                                          eos_flag=False)
    ontology_idx_dict, idx_ontology_dict = dictionary_generator(
        ontology_voc_list, oov_flag=False)

    train_token = list()
    for one_raw_sentence in train_raw_token:
        train_token.append(data_indexer(one_raw_sentence, token_idx_dict))

    test_raw_token = list()
    for one_idx in randomized_idx[:test_size]:
        test_raw_token.append(raw_sentences[one_idx])
    test_token = list()
    for one_raw_sentence in test_raw_token:
        test_token.append(data_indexer(one_raw_sentence, token_idx_dict))

    train_ontology = list()
    for one_raw_ontology in train_raw_ontology:
        train_ontology.append(data_indexer(one_raw_ontology,
                                           ontology_idx_dict))

    test_raw_ontology = list()
    for one_idx in randomized_idx[:test_size]:
        test_raw_ontology.append(ontology_results[one_idx])
    test_ontology = list()
    for one_raw_ontology in test_raw_ontology:
        test_ontology.append(data_indexer(one_raw_ontology, ontology_idx_dict))

    # Store pretrained_dict with the split so the tuple matches the nine
    # entries that s2s_end2end unpacks from the split pickle.
    split_data = (token_idx_dict, idx_token_dict, ontology_idx_dict,
                  idx_ontology_dict, pretrained_dict, train_token, test_token,
                  train_ontology, test_ontology)
    pickle.dump(split_data, open("data/split/20180331.pkl", "wb"))
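
# s2s_split and s2s_preprocess both call a project-local dictionary_generator
# helper that is not shown. A minimal sketch of the assumed behaviour: build
# forward and reverse index dictionaries over a vocabulary, optionally
# reserving entries for OOV and EOS symbols. The reserved symbols and index
# layout here are assumptions, not the real helper.
def dictionary_generator_sketch(voc_list, oov_flag=True, eos_flag=True):
    item_idx_dict = dict()
    if oov_flag:
        item_idx_dict["<OOV>"] = len(item_idx_dict)
    if eos_flag:
        item_idx_dict["<EOS>"] = len(item_idx_dict)
    for one_item in sorted(voc_list):
        item_idx_dict[one_item] = len(item_idx_dict)
    idx_item_dict = {one_idx: one_item
                     for one_item, one_idx in item_idx_dict.items()}
    return item_idx_dict, idx_item_dict

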
def s2s_preprocess(train_file_name, test_file_name):
    raw_sentences = list()
    ontology_results = list()
    max_length = 0
    for one_line in open(train_file_name):
        one_line = one_line.strip()
        print(one_line)
        if len(one_line.split("\t")) != 2:
            continue
        raw_sentence = one_line.split("\t")[0]
        ontology_string = one_line.split("\t")[1]
        tokenized_list = word_tokenize(raw_sentence)
        if len(tokenized_list) > max_length:
            max_length = len(tokenized_list)
        ontology_tuple = ontology_string.split()
        if len(ontology_tuple) != 3:
            continue
        raw_sentences.append(tokenized_list)
        ontology_results.append(ontology_tuple)

    token_voc_list = list()
    ontology_voc_list = list()

    for one_raw_sentence in raw_sentences:
        for one_token in one_raw_sentence:
            token_voc_list.append(one_token)
    token_voc_list = list(set(token_voc_list))

    for one_ontology_result in ontology_results:
        for one_ontology in one_ontology_result:
            ontology_voc_list.append(one_ontology)
    ontology_voc_list = list(set(ontology_voc_list))

    token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list, eos_flag=False)
    ontology_idx_dict, idx_ontology_dict = dictionary_generator(ontology_voc_list, oov_flag=False)

    token_store_data = list()
    for one_raw_sentence in raw_sentences:
        token_store_data.append(data_indexer(one_raw_sentence, token_idx_dict))

    ontology_store_data = list()
    for one_ontology_result in ontology_results:
        ontology_store_data.append(data_indexer(one_ontology_result, ontology_idx_dict))

    pretrained_dict = dict()

    print("Loading pretrained Word2Vec model ...")
    w2v_embedding_path = "data/w2v/wiki20170101"
    w2v_model = Word2Vec.load(w2v_embedding_path)

    # Collect a pretrained vector for every in-vocabulary token that appears
    # in either the training or the test file.
    for corpus_file_name in (train_file_name, test_file_name):
        for one_line in open(corpus_file_name):
            one_line = one_line.strip()
            if len(one_line.split("\t")) != 2:
                continue
            raw_sentence = one_line.split("\t")[0]
            tokenized_list = word_tokenize(raw_sentence)
            for one_token in tokenized_list:
                if one_token not in w2v_model.wv.vocab:
                    continue
                pretrained_dict[one_token] = w2v_model.wv[one_token]

    processed_data = (token_idx_dict,
                      idx_token_dict,
                      ontology_idx_dict,
                      idx_ontology_dict,
                      pretrained_dict,
                      token_store_data,
                      ontology_store_data,
                      raw_sentences,
                      ontology_results)
    pickle.dump(processed_data, open("data/preprocessed/20180405.pkl", "wb"))
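
# A hedged sketch of how the three stages are assumed to be chained; the
# train.tsv / test.tsv paths below are illustrative placeholders, and the
# dated pickle/model paths hard-coded above must match what is on disk.
if __name__ == "__main__":
    s2s_preprocess("data/raw/train.tsv", "data/raw/test.tsv")
    s2s_split("data/preprocessed/20180405.pkl")
    s2s_end2end("data/raw/test.tsv")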