def __build_dictionary(self):
    """Tokenize every document in ``self.data`` and persist the result.

    For each record (assumes each item is a dict with a 'content' key —
    TODO confirm against the loader), extracts its word features via the
    project's NLP helper, collects them, and stores the word lists at
    ``settings.DICTIONARY_PATH`` through FileStore.
    """
    print('Building dictionary')
    dict_words = []
    total = len(self.data)  # hoisted: invariant across the loop
    # enumerate from 1 so the progress log reads "Step 1 / N" .. "Step N / N"
    for step, text in enumerate(self.data, start=1):
        print("Dictionary Step {} / {}".format(step, total))
        words = NLP(text=text['content']).get_words_feature()
        dict_words.append(words)
    FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)
# --- tf-idf feature-extraction pipeline (top-level script section) ---
# NOTE(review): `json_train` is defined in an earlier part of this script,
# outside this chunk.
json_test = DataLoader(dataPath=settings.DATA_TEST_PATH).get_json()
# BUGFIX: was `print('...'), str(datetime.now())` — a py2->py3 conversion
# artifact that built a throwaway tuple, so the timestamp was never printed.
print('Load Data to JSON Done! ', str(datetime.now()))

# Feature Extraction
print('Featuring Extraction... ', str(datetime.now()))

# tf-idf: raw documents and labels from the project's extractor
features_train, labels_train = FeatureExtraction(
    data=json_train).get_data_and_label_tfidf()
features_test, labels_test = FeatureExtraction(
    data=json_test).get_data_and_label_tfidf()

# Fit the vectorizer on train only; test set is transformed with the
# fitted vocabulary to avoid leakage.
vectorizer = TfidfVectorizer(
    use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1, 2))
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test)

# Persist the fitted vectorizer so inference can reuse the same vocabulary.
FileStore(filePath=settings.VECTOR_EMBEDDING).save_pickle(obj=vectorizer)
print('Feature Extraction Done! ', str(datetime.now()))

# Save feature extraction
features_train_dict = get_feature_dict(
    value_features=features_train, value_labels=labels_train)
features_test_dict = get_feature_dict(
    value_features=features_test, value_labels=labels_test)
FileStore(filePath=settings.FEATURES_TRAIN).save_pickle(
    obj=features_train_dict)
FileStore(filePath=settings.FEATURES_TEST).save_pickle(
    obj=features_test_dict)
print("Store data DONE!")
def save_model(self, filePath):
    """Pickle the trained estimator (``self.estimator``) to *filePath*.

    Logs a timestamped message before and after the write.
    """
    print('Saving Model... ', str(datetime.now()))
    store = FileStore(filePath=filePath)
    store.save_pickle(obj=self.estimator)
    print('Save Model Done! ', str(datetime.now()))
self.__build_dataset() return self.features, self.labels def read_feature(self): return self.data['features'] , self.data['labels'] def get_feature_dict(value_features,value_labels): return { "features":value_features, "labels":value_labels } if __name__ == '__main__': print('Reading data raw... ', str(datetime.now())) json_train = DataLoader(dataPath=settings.DATA_TRAIN_PATH).get_json_train() FileStore(filePath=settings.DATA_TRAIN_JSON, data=json_train).store_json() json_test = DataLoader(dataPath=settings.DATA_TEST_PATH).get_json_test(2000) FileStore(filePath=settings.DATA_TEST_JSON, data=json_test).store_json() print('Load Data to JSON Done! ', str(datetime.now())) # Load data after preprocess # train_loader = FileReader(filePath=settings.DATA_TRAIN_JSON) # test_loader = FileReader(filePath=settings.DATA_TEST_JSON) # json_train = train_loader.read_json() # json_test = test_loader.read_json() # Feature Extraction print('Featuring Extraction... ', str(datetime.now())) # Bow # features_train, labels_train = FeatureExtraction(data=json_train).get_data_and_label_bow() # features_test, labels_test = FeatureExtraction(data=json_test).get_data_and_label_bow()