Ejemplo n.º 1
0
def make_tfidf_combined_feature_cos_100_holdout(row_body_path, row_stance_path,
                                                row_test_body_path,
                                                row_test_stance_path, head_pkl,
                                                body_pkl, label_path,
                                                tfidf_cos_path):
    """
    Build the combined head / body / cosine feature matrix from pickles.

    When any of ``body_pkl``, ``head_pkl``, ``label_path`` or
    ``tfidf_cos_path`` is missing on disk,
    ``make_tfidf_cos_feature_100_holdout`` is called with every argument of
    this function to create them. The head, body and cosine pickles are then
    loaded and joined column-wise.

    :return: array with the columns ordered as head, body, cosine
    """
    print(tfidf_cos_path)

    # Checked in the same order as listed; the first missing file triggers
    # regeneration of all of them.
    cached_files = (body_pkl, head_pkl, label_path, tfidf_cos_path)
    if not all(os.path.exists(path) for path in cached_files):
        print('.pkl files not exist. We will make new TF-IDF .pkl vectors')
        make_tfidf_cos_feature_100_holdout(
            row_body_path, row_stance_path, row_test_body_path,
            row_test_stance_path, head_pkl, body_pkl, label_path,
            tfidf_cos_path)
        print('.pkl make finish!')

    body_tfidf = load_model(body_pkl)
    head_tfidf = load_model(head_pkl)
    cos_feature = load_model(tfidf_cos_path)

    print('shape : ', body_tfidf.shape, head_tfidf.shape)
    print(cos_feature.shape)

    # head/body are densified with .toarray() before joining with the cosine
    # feature along the column axis.
    return np.concatenate(
        (head_tfidf.toarray(), body_tfidf.toarray(), cos_feature), axis=1)
Ejemplo n.º 2
0
def make_NMF_300_feature(row_body_path,
                         row_stance_path,
                         head_tfidf_pkl,
                         body_tfidf_pkl,
                         label_path,
                         save_nmf_model_path,
                         save_head_path,
                         save_body_path,
                         cos_dist=False):
    """
    Return 300-component NMF features for the head and body TF-IDF matrices.

    Every stage is cached on disk and only recomputed when its file is
    missing: the TF-IDF pickles (via ``make_tfidf_feature_5000``), the fitted
    NMF model, and the transformed head / body matrices.

    :param cos_dist: when true, the row-wise cosine distance between the head
        and body NMF vectors is appended as one extra column
    :return: head and body NMF matrices joined column-wise, followed by the
        cosine-distance column when ``cos_dist`` is set
    """
    tfidf_ready = (os.path.exists(head_tfidf_pkl)
                   and os.path.exists(body_tfidf_pkl)
                   and os.path.exists(label_path))
    if not tfidf_ready:
        make_tfidf_feature_5000(row_body_path, row_stance_path,
                                head_tfidf_pkl, body_tfidf_pkl, label_path,
                                model_save=True)

    tfidf_body = load_model(body_tfidf_pkl)
    tfidf_head = load_model(head_tfidf_pkl)

    if not os.path.exists(save_nmf_model_path):
        # The model is fitted on head rows stacked on top of body rows.
        stacked = np.concatenate(
            (tfidf_head.toarray(), tfidf_body.toarray()), axis=0)
        print('fit NMF topic model')
        started = time()
        topic_model = NMF(n_components=300, random_state=1, alpha=.1)
        topic_model.fit(stacked)
        print('done in {}'.format(time() - started))
        save_model(save_nmf_model_path, topic_model)

    # Always go through the pickle so fresh and cached runs use the same model.
    nmf = load_model(save_nmf_model_path)

    features_cached = (os.path.exists(save_head_path)
                       and os.path.exists(save_body_path))
    if not features_cached:
        head_topics = nmf.transform(tfidf_head)
        body_topics = nmf.transform(tfidf_body)
        save_model(save_head_path, head_topics)
        print('saved model {}'.format(save_head_path))
        save_model(save_body_path, body_topics)
        print('saved model {}'.format(save_body_path))

    nmf_head_matrix = load_model(save_head_path)
    nmf_body_matrix = load_model(save_body_path)

    if not cos_dist:
        return np.concatenate((nmf_head_matrix, nmf_body_matrix), axis=1)

    # One cosine distance per (head row, body row) pair, kept as a
    # single-element list so the stacked result has one column.
    pair_distances = []
    for idx, head_row in enumerate(nmf_head_matrix):
        head_vec = np.array(head_row).reshape((1, -1))
        body_vec = np.array(nmf_body_matrix[idx]).reshape((1, -1))
        pair_distances.append(
            cosine_distances(head_vec, body_vec).flatten().tolist())
    distance_column = np.array(pair_distances)

    topic_features = np.concatenate(
        (nmf_head_matrix, nmf_body_matrix), axis=1)
    return np.concatenate((topic_features, distance_column), axis=1)
    def get_tfidf_vocab_5000_holdout(self, test_body, test_stance):
        """
        Return the vocabulary used to build the hold-out TF-IDF vectors.

        The vocabulary is cached at
        ``../pickled_model/tfidf_holdout_vocab.pkl``. On a cache hit it is
        loaded and returned straight away. Otherwise a ``TfidfVectorizer``
        (max 5000 features, unigrams) is fitted on the training documents
        (``body + " " + head`` for each pair in ``self.body`` / ``self.head``)
        followed by the test bodies and test heads, and the resulting
        vocabulary is pickled.

        :param test_body: forwarded to ``Dataset`` as its first argument
        :param test_stance: forwarded to ``Dataset`` as its second argument
        :return: the vocabulary, which is also stored on ``self.vocab``
        """
        vocab_path = '../pickled_model/tfidf_holdout_vocab.pkl'

        # Cache hit: return the stored vocabulary. The previous version only
        # returned from the cache-miss branch, so this path yielded None, and
        # it fitted the vectorizer first even though the result was discarded.
        if os.path.exists(vocab_path):
            self.vocab = load_model(vocab_path)
            print('vocab loaded!')
            return self.vocab

        test_dataset = Dataset(test_body, test_stance)
        test_heads, test_bodies = test_dataset.read_tfidf_data()

        # Corpus order: combined train documents, then test bodies, then
        # test heads.
        train_data = [b + " " + h for b, h in zip(self.body, self.head)]
        train_data.extend(test_bodies)
        train_data.extend(test_heads)

        model = TfidfVectorizer(max_features=5000,
                                ngram_range=(1, 1),
                                stop_words='english',
                                norm='l2',
                                use_idf=False)
        # Only the learned vocabulary is needed, so fit() is enough.
        model.fit(train_data)

        self.vocab = model.vocabulary_
        save_model(vocab_path, self.vocab)
        return self.vocab
    def tfidf_cos_save(self, head_path, body_path, filename, model_save=False):
        """
        Compute the row-wise cosine similarity between head and body features.

        Both pickles are loaded with ``load_model`` and densified with
        ``.toarray()``; rows are paired positionally.

        :param filename: file name appended to ``self.save_path`` when saving
        :param model_save: when true the result is written with ``save_model``
            and nothing is returned; otherwise the array is returned
        :return: array with one similarity entry per row pair, or ``None``
            when ``model_save`` is true
        """
        head_rows = load_model(head_path).toarray()
        body_rows = load_model(body_path).toarray()

        # cosine_similarity expects 2-D inputs, so each row is reshaped to
        # (1, n_features); [0] keeps the single-element result row.
        similarities = np.array([
            cosine_similarity(head_vec.reshape(1, -1),
                              body_vec.reshape(1, -1))[0]
            for head_vec, body_vec in zip(head_rows, body_rows)
        ])

        if not model_save:
            return similarities

        saved_path = self.save_path + "/" + filename
        print('tfidf_cos saving......')
        save_model(saved_path, similarities)
        print('feature saving finished!')
        print('saved path : ', saved_path)
Ejemplo n.º 5
0
def load_tfidf_y(pkl_path):
    """
    Load and return whatever ``load_model`` reads from ``pkl_path``.

    :param pkl_path: path handed unchanged to ``load_model``
    :return: the object returned by ``load_model``
    """
    loaded = load_model(pkl_path)
    return loaded
# Run-mode selector.
# NOTE(review): nothing in this section reads `mode`; the `if mode == 'train':`
# guard below is commented out, so the training data is loaded unconditionally
# even though mode is 'test'.
mode = 'test'
# mode='fold_test'

# Array loaded from a .npy file under base_feat_path; presumably the embedding
# matrix for the LSTM named in the file name — confirm against the model code.
embedding = np.load(base_feat_path + "/single_flat_LSTM_50d_100_embedding.npy")
# Hyper-parameters; they are only defined here, their consumers are outside
# this section.
dense_size = 600
dropout_keep_prob = 0.8
fold_size = 5

# if mode == 'train':

# Training inputs: sequence data, the combined TF-IDF + cosine feature matrix,
# and the labels loaded from train_label.
X_train_seq = get_sequence_data(train_body, train_stance, 'train')
X_train_feat = make_tfidf_combined_feature_cos_100(train_body, train_stance,
                                                   train_head_dir,
                                                   train_body_dir, train_label,
                                                   train_cos_dir)
y_train = load_model(train_label)
# The same seed is used for all three calls and the return values are
# discarded — presumably data_shuffle shuffles in place so the three arrays
# stay row-aligned; TODO confirm against data_shuffle.
data_shuffle(X_train_seq, seed=12345)
data_shuffle(X_train_feat, seed=12345)
data_shuffle(y_train, seed=12345)

# print(X_train_seq[:2])
# print(y_train[:10])
# Length of the first feature row.
input_len = len(X_train_feat[0])
# elif mode == 'test':
# X_test_seq = get_sequence_data(test_body, test_stance, 'test')
# X_test_feat = make_tfidf_combined_feature_cos_100(test_body, test_stance, test_head_dir, test_body_dir,
#                                                   test_label, test_cos_dir)
# y_test = load_model(test_label)
#   # print(X_test_seq[:2])
# input_len = len(X_test_feat[0])
graph = tf.Graph()
Ejemplo n.º 7
0
             +str(training_epoch)+"_batch"+str(batch_size)+"_holdout"

# Run-mode selector: exactly one assignment should be active. Only 'train' and
# 'test' are handled below; with any other value (e.g. 'fold_test') this
# section defines neither the data arrays nor input_len.
# mode = 'train'
mode = 'test'
# mode='fold_test'

# Array loaded from a .npy file under base_feat_path; presumably the embedding
# matrix for the LSTM named in the file name — confirm against the model code.
embedding = np.load(base_feat_path+"/single_flat_LSTM_50d_100_embedding.npy")
# Hyper-parameters; they are only defined here, their consumers are outside
# this section.
dense_size = 600
dropout_keep_prob = 0.8
fold_size = 5

if mode == 'train':
    # Training inputs: sequence data, combined TF-IDF + cosine features, and
    # the labels loaded from train_label.
    X_train_seq = get_sequence_data(train_body, train_stance, 'train')
    X_train_feat = make_tfidf_combined_feature_cos_100_holdout(train_body, train_stance, test_body, test_stance, train_head_dir, train_body_dir,
                                                       train_label, train_cos_dir)
    y_train = load_model(train_label)
    # Same seed for all three calls, return values discarded — presumably
    # data_shuffle shuffles in place so rows stay aligned; TODO confirm.
    data_shuffle(X_train_seq, seed=12345)
    data_shuffle(X_train_feat, seed=12345)
    data_shuffle(y_train, seed=12345)
    # Length of the first feature row.
    input_len = len(X_train_feat[0])

elif mode == 'test':
    X_test_seq = get_sequence_data(test_body, test_stance, 'test')
    # NOTE(review): compared with the train branch, the test and train paths
    # swap positions in the first four arguments here — verify this matches
    # the parameter order the helper expects.
    X_test_feat = make_tfidf_combined_feature_cos_100_holdout(test_body, test_stance, train_body, train_stance, test_head_dir, test_body_dir,
                                                      test_label, test_cos_dir)
    y_test = load_model(test_label)
    # Length of the first feature row.
    input_len = len(X_test_feat[0])

graph = tf.Graph()
with graph.as_default():
    X_seq = tf.placeholder(tf.int32, [None, seq_len])