def read_dataset_train_head_body(datapath):
    """Build an index -> {'head', 'body'} dict from the training head/body pairs."""
    head_train, body_train = get_head_body_tuples(data_path=datapath)
    dataset = {}
    for idx, (head, body) in enumerate(zip(head_train, body_train)):
        dataset[idx] = {'head': head, 'body': body}
    return dataset
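
# Usage sketch, not part of the original snippet: the '../../data' path is an
# assumption, mirroring the data_path used in the examples below.
dataset = read_dataset_train_head_body('../../data')
print('pairs loaded:', len(dataset))
print('first head:', dataset[0]['head'])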
Example #2
        with open(combined_file_name, 'wb') as f:
            pkl.dump(combined, f, pkl.HIGHEST_PROTOCOL)
        print('Finished saving the count head/body vectors')
        print('Head : {}\nBody : {}\nCombined : {}\n'.format(
            head_file_name, body_file_name, combined_file_name))


if __name__ == "__main__":
    from Tree_models.utils.get_input_datas import get_head_body_tuples, get_head_body_tuples_test
    model_path = '../../pickled_data'

    max_features = 5000  # lower max_features if memory runs out

    filename = 'count_1st_' + str(max_features) + '_vectorizer_model.pkl'

    head, body = get_head_body_tuples(data_path='../../data')
    head_test, body_test = get_head_body_tuples_test(data_path='../../data')

    count_vectorizer = CountVector_generator(max_features=max_features,
                                             analyzer='word',
                                             ngram_range=(1, 3),
                                             stop_words='english')

    count_vectorizer.fit(head, body)
    # if a saved vectorizer model already exists, use load_model instead
    count_vectorizer.save_model(model_path=model_path, filename=filename)
    # count_vectorizer.load_model(model_path=model_path, filename=filename)

    # if saved TF-IDF vectors already exist, you can train on them directly
    # save the transformed train file
    count_vectorizer.transform_and_save_data(head,
                                             body,
                                             save_path=model_path)
    # (the remaining arguments of this call were truncated in the source)
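
# Hedged sketch, not part of the original snippet: reloading the pickled
# vectors for training. The file name below is hypothetical; use whatever
# name transform_and_save_data actually wrote under model_path.
import pickle as pkl
with open(model_path + '/count_train_vectors.pkl', 'rb') as f:  # hypothetical name
    train_vectors = pkl.load(f)
print(train_vectors.shape)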
Example #3
from Tree_models.utils.get_input_datas import get_head_body_tuples, get_head_body_tuples_test, get_y_labels
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from Tree_models.utils.score import report_score
head_train, body_train = get_head_body_tuples()
head_test, body_test = get_head_body_tuples_test()

train_y, test_y = get_y_labels()

count_vec = CountVectorizer(analyzer='word',
                            ngram_range=(1, 1),
                            stop_words='english',
                            max_features=2500)
# fit the vocabulary on the headline and body joined into one string per pair
count_vec.fit([h + ". " + b for h, b in zip(head_train, body_train)])

# count_vocab = count_vec.vocabulary_
print('count_vec ...')
head_train = count_vec.transform(head_train)
body_train = count_vec.transform(body_train)
head_test = count_vec.transform(head_test)
body_test = count_vec.transform(body_test)
print('count_vec finish...')

train_data = np.concatenate((head_train.toarray(), body_train.toarray()),
                            axis=1)
test_data = np.concatenate((head_test.toarray(), body_test.toarray()), axis=1)
# train a decision tree on the concatenated head/body count vectors
clf = DecisionTreeClassifier()
clf.fit(train_data, train_y)
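
# Hedged evaluation sketch, not part of the original snippet: accuracy via
# scikit-learn; report_score (imported above) is presumably the FNC-style
# scorer from Tree_models.utils.score, and its signature is assumed here.
from sklearn.metrics import accuracy_score

pred_y = clf.predict(test_data)
print('accuracy:', accuracy_score(test_y, pred_y))
# report_score(test_y, pred_y)  # assumed signature: (gold_labels, predictions)
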
Example #4
import pickle as pkl
import numpy as np
from tqdm import tqdm
from Tree_models.utils.get_input_datas import get_head_body_tuples, get_head_body_tuples_test
# load_Glove is assumed to come from this repo's GloVe utilities; its import was truncated in the source
def save_summation_vectors(data_path,
                           glove_path,
                           glove_file,
                           save_path,
                           dim=50):
    model = load_Glove(glove_path=glove_path, glove_file=glove_file)
    h_train, b_train = get_head_body_tuples(data_path=data_path)
    h_test, b_test = get_head_body_tuples_test(data_path=data_path)

    sum_head_train = []
    sum_body_train = []
    sum_head_test = []
    sum_body_test = []

    for h, b in tqdm(zip(h_train, b_train)):
        head_vectors = np.zeros(dim)
        body_vectors = np.zeros(dim)
        # the original zipped the head and body word lists together, which
        # stops at the shorter list and silently drops the rest of the body;
        # the two texts are summed in separate loops instead
        for wh in h.split():
            if wh in model:
                head_vectors += model[wh]
        for wb in b.split():
            if wb in model:
                body_vectors += model[wb]

        sum_head_train.append(head_vectors)
        sum_body_train.append(body_vectors)
    for h, b in tqdm(zip(h_test, b_test)):
        head_vectors = np.zeros(dim)
        body_vectors = np.zeros(dim)
        for wh in h.split():
            if wh in model:
                head_vectors += model[wh]
        for wb in b.split():
            if wb in model:
                body_vectors += model[wb]

        sum_head_test.append(head_vectors)
        sum_body_test.append(body_vectors)

    sum_head_train, sum_body_train, sum_head_test, sum_body_test = \
        np.array(sum_head_train), np.array(sum_body_train), np.array(sum_head_test), np.array(sum_body_test)
    print(sum_head_train.shape)
    print(sum_body_train.shape)
    print(sum_head_test.shape)
    print(sum_body_test.shape)
    train_file = save_path + "/glove{}D_sum_head_body_train.pkl".format(dim)
    with open(train_file, 'wb') as f:
        pkl.dump(np.hstack((sum_head_train, sum_body_train)), f,
                 pkl.HIGHEST_PROTOCOL)
    print('file saved {}'.format(train_file))

    test_file = save_path + "/glove{}D_sum_head_body_test.pkl".format(dim)
    with open(test_file, 'wb') as f:
        pkl.dump(np.hstack((sum_head_test, sum_body_test)), f,
                 pkl.HIGHEST_PROTOCOL)
    print('file saved {}'.format(test_file))