Example #1
def make_tfidf_feature_100_holdout(row_body_path,
                                   row_stance_path,
                                   row_test_body_path,
                                   row_test_stance_path,
                                   head_save_path,
                                   body_save_path,
                                   stance_save_path,
                                   model_save=True):
    if not os.path.exists(head_save_path) or not os.path.exists(body_save_path) \
            or not os.path.exists(stance_save_path):
        dataset = Dataset(row_body_path, row_stance_path)
        head, body, stance = dataset.read_combine()
        fe = Feature_enginnering(head, body, stance)
        # "tfidf_label_one_hot_train.pkl"
        # 'tfidf_body_feature_train.pkl'
        # 'tfidf_head_feature_train.pkl'
        fe.get_tfidf_vocab_100_holdout(row_test_body_path,
                                       row_test_stance_path)
        if not os.path.exists(head_save_path):
            fe.tfidf_train_head(head_save_path, model_save=model_save)
        if not os.path.exists(body_save_path):
            fe.tfidf_train_body(body_save_path, model_save=model_save)
        if not os.path.exists(stance_save_path):
            fe.tfidf_stance_save(stance_save_path, model_save=model_save)

    print('train_idf_100 feature saved!')

    def get_tfidf_vocab_5000_holdout(self, test_body, test_stance):
        """
        Return the train vocab used to build TF-IDF vectors.
        :return: TF-IDF vocab for training
        """
        test_dataset = Dataset(test_body, test_stance)
        t_h, t_b = test_dataset.read_tfidf_data()
        test_h = list(t_h)
        test_b = list(t_b)
        train_data = [b + " " + h for b, h in zip(self.body, self.head)]
        train_data.extend(test_b)
        train_data.extend(test_h)

        model = TfidfVectorizer(max_features=5000,
                                ngram_range=(1, 1),
                                stop_words='english',
                                norm='l2',
                                use_idf=False)
        model.fit_transform(train_data)
        # Reuse a previously pickled vocabulary if one exists; otherwise persist the new one.
        if os.path.exists('../pickled_model/tfidf_holdout_vocab.pkl'):
            self.vocab = load_model('../pickled_model/tfidf_holdout_vocab.pkl')
            print('vocab loaded!')
        else:
            self.vocab = model.vocabulary_
            save_model('../pickled_model/tfidf_holdout_vocab.pkl',
                       model.vocabulary_)
        return self.vocab
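
A minimal call sketch for make_tfidf_feature_100_holdout. The data paths are hypothetical placeholders; the pickle names only mirror the comments inside the function above and are not taken from the original project layout.

# Hypothetical usage sketch (paths are placeholders, not the original project's files).
make_tfidf_feature_100_holdout('data/train_bodies.csv',
                               'data/train_stances.csv',
                               'data/test_bodies.csv',
                               'data/test_stances.csv',
                               'pickled_data/tfidf_head_feature_train.pkl',
                               'pickled_data/tfidf_body_feature_train.pkl',
                               'pickled_data/tfidf_label_one_hot_train.pkl',
                               model_save=True)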
Example #3
def single_flat_LSTM_50d_100(body_path, stance_path, mode):

    GloVe_vectors = load_embedding_pandas(param_dict['GLOVE_ZIP_FILE'], param_dict['GLOVE_FILE'], type="w2v")
    print(GloVe_vectors[:5])  # sanity check: preview the first few embedding rows
    d_set = Dataset(body_path, stance_path)
    head, body, one_hot_label = d_set.read_combine()
    # Collect headline and body texts; avoid shadowing the built-in all().
    all_text = head.tolist()
    all_text.extend(body.tolist())

    vocab = create_embedding_lookup_pandas(all_text, param_dict["MAX_NB_WORDS"], param_dict["EMBEDDING_DIM"],
                                           GloVe_vectors, param_dict["EMBEDDING_FILE"], param_dict["VOCAB_FILE"],
                                           init_zeros=False, add_unknown=True, rdm_emb_init=True,
                                           tokenizer=nltk.word_tokenize)

    del GloVe_vectors

    # Join each headline with its body into one text before building sequences.
    concatenated = []
    for i in range(len(head)):
        concatenated.append(head[i] + ". " + body[i])
    sequences = text_to_sequences_fixed_size(concatenated, vocab, param_dict["MAX_SEQ_LENGTH"],
                                             save_full_text=False, take_full_claim=True)

    if mode == 'train':
        with open(FEATURES_DIR + PARAM_DICT_FILENAME, 'wb') as f:
            pickle.dump(param_dict, f, pickle.HIGHEST_PROTOCOL)
        print("Save PARAM_DICT as " + FEATURES_DIR + PARAM_DICT_FILENAME)

    return sequences
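
text_to_sequences_fixed_size and create_embedding_lookup_pandas are project helpers that are not shown here. As a rough illustration of the sequence step only, the sketch below maps tokenized text to fixed-length id sequences with truncation and zero padding; the function name and exact behavior are assumptions, not the original implementation.

# Illustrative sketch of fixed-size sequence conversion (assumed behavior,
# not the project's text_to_sequences_fixed_size).
import nltk

def to_fixed_size_sequences(texts, vocab, max_seq_length, unk_token='UNK'):
    sequences = []
    for text in texts:
        tokens = nltk.word_tokenize(text.lower())
        ids = [vocab.get(tok, vocab.get(unk_token, 0)) for tok in tokens]
        ids = ids[:max_seq_length]                   # truncate long texts
        ids += [0] * (max_seq_length - len(ids))     # zero-pad short texts
        sequences.append(ids)
    return sequences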
    def get_tfidf_vocab_100_holdout(self, test_body, test_stance):
        """
        Return the train vocab used to build TF-IDF vectors.
        :return: TF-IDF vocab for training
        """
        test_dataset = Dataset(test_body, test_stance)
        t_h, t_b = test_dataset.read_tfidf_data()
        test_h = list(t_h)
        test_b = list(t_b)
        train_data = [b + " " + h for b, h in zip(self.body, self.head)]
        train_data.extend(test_b)
        train_data.extend(test_h)

        model = TfidfVectorizer(max_features=100,
                                ngram_range=(1, 1),
                                stop_words='english',
                                norm='l2',
                                use_idf=False)
        model.fit_transform(train_data)

        self.vocab = model.vocabulary_
        return self.vocab
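
The vocabulary returned above can be reused to vectorize headlines and bodies over the same 100-term feature space. A minimal sketch assuming scikit-learn's TfidfVectorizer with a fixed vocabulary; the helper and variable names are illustrative, not the original tfidf_train_head/tfidf_train_body implementations.

# Sketch: reuse the shared holdout vocabulary so head and body matrices
# share one column order (illustrative only).
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_with_shared_vocab(texts, vocab):
    vectorizer = TfidfVectorizer(vocabulary=vocab,
                                 ngram_range=(1, 1),
                                 stop_words='english',
                                 norm='l2',
                                 use_idf=False)
    return vectorizer.fit_transform(texts)

# head_feats = tfidf_with_shared_vocab(head_texts, fe.vocab)
# body_feats = tfidf_with_shared_vocab(body_texts, fe.vocab)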