def calculate_best(self, train, test):
    """Train a C45 classifier on the attributes enabled in ``self.position`` and score it.

    A TFIDF model is built from ``train["Review"]``, its weights are passed
    through ``remove_zero_tfidf`` with the argument ``0.5``, and its term index
    is reduced to the entries whose ordinal position is truthy in the boolean
    form of ``self.position``.  The classifier trained on that model is stored
    on ``self.clf`` and its score on ``test`` is stored on ``self.best``.

    Args:
        train: Training data; must be indexable by ``"Review"``.
        test: Data handed to ``C45.score`` together with the TFIDF model.

    Returns:
        The value returned by ``self.clf.score(tfidf, test)``.
    """
    mask = self.position.astype(bool)

    tfidf = TFIDF(train["Review"])
    tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)

    # Keep a term only when the mask entry at the term's ordinal position is set.
    kept_terms = {}
    for idx, (term, value) in enumerate(tfidf.termIndex.items()):
        if mask[idx]:
            kept_terms[term] = value
    tfidf.termIndex = kept_terms
    print(f"Selected attributes: {len(tfidf.termIndex)}")

    classifier = C45(tfidf, train)
    self.clf = classifier
    classifier.train()

    self.best = classifier.score(tfidf, test)
    return self.best
def mltrain_fn(self, params=None):
    """Train a C45 classifier on one stored training fold.

    Args:
        params: Optional mapping with the keys
            ``'i'`` (zero-based fold index; the file loaded is
            ``data/folds/train{i + 1}.pckl``),
            ``'remove_zero_tfidf'`` (when truthy, the TFIDF weights are passed
            through ``remove_zero_tfidf`` with the argument ``0.4``) and
            ``'UI'`` (returned unchanged, or ``None`` when falsy).
            Missing keys fall back to ``None`` / ``False`` / ``None``.

    Returns:
        A tuple ``(i, clf, tfidf, ui)`` holding the fold index, the trained
        classifier, the TFIDF model and the ``'UI'`` entry.

    Raises:
        TypeError: If no fold index ``'i'`` is supplied.
    """
    # Build the effective parameters per call: no shared mutable default, and
    # callers may pass only the keys they care about.
    defaults = {'i': None, 'remove_zero_tfidf': False, 'UI': None}
    params = {**defaults, **(params or {})}

    fold = params['i']
    if fold is None:
        # Fail before any I/O instead of with an opaque ``None + 1`` error.
        raise TypeError("params['i'] must be a fold index, got None")

    train = self.storage.load(f"data/folds/train{fold + 1}.pckl")
    tfidf = TFIDF(train["Review"])
    if params['remove_zero_tfidf']:
        tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.4)

    clf = C45(tfidf, train)
    clf.train()
    return fold, clf, tfidf, params['UI'] or None
# NOTE(review): the `return` below is the tail of a function whose `def` line
# lies outside this view; it is kept unchanged.
    return train, test


def import_data(filename):
    """Load a dataset from ``filename`` via ``DataImporter``.

    Prints a progress message and returns whatever
    ``DataImporter(filename).get_data()`` returns.
    """
    print(f"Import {filename}")
    importer = DataImporter(filename)
    return importer.get_data()


storage = Storage()

# Load a stored particle and convert its position vector into a list of
# Python booleans, used below as a mask over the term list.
particle = storage.load('./pickle/pso-1.pckl')
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]

# Rebuild the TFIDF model on the training part of the stored dataset and keep
# only the terms whose mask entry is True.
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

# Re-import the spreadsheet, preprocess its reviews with the selected terms,
# and save the result under a timestamped file name.
new_data = import_data('../data/Avg_55,26.xlsx')
new_data['Review'] = preprocess_data(new_data, features)
storage.save(new_data, f"pickle/preprocessed-{time.time()}.pckl")

# Disabled experiment: 10-fold training restricted to the selected terms.
# kf = KFold(n_splits=10, shuffle=True, random_state=2)
# for i, (train, test) in enumerate(kf.split(data)):
#     print("Train optimized")
#     tfidf = TFIDF(data.iloc[train]["Review"])
#     tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
#     tfidf.termIndex = {key:val for i, (key, val) in enumerate(tfidf.termIndex.items()) if key in features}
#     clf = C45(tfidf, data.iloc[train])
import os
import sys

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent.  The working directory is split on
# os.sep so the check also works where the separator is not a backslash
# (on Windows os.sep is "\\", so behaviour there is unchanged).
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# Load the stored dataset and build the TFIDF model over its reviews.
s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])

# Label -> display label.  Currently an identity mapping.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral",
}
# Display label -> colour name.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue",
}

translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

# Project the TFIDF weights onto two principal components and take the
# standard deviation of each projected axis.
pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])
def preprocess_data(data):
    """Preprocess every entry of ``data['Review']`` and return the results.

    Each review is handed to ``Preprocessor.preprocess`` and the items it
    returns are joined with single spaces.  A progress line is printed per
    review.

    Returns:
        A list with one joined string per review, in input order.
    """
    print(f"Preprocess data...")
    cleaner = Preprocessor()
    cleaned_reviews = []
    for number, text in enumerate(data['Review'], start=1):
        pieces = cleaner.preprocess(text)
        cleaned_reviews.append(" ".join(pieces))
        print(f"Review {number} preprocessed")
    return cleaned_reviews


# Disabled: rebuild the preprocessed dataset from the spreadsheet.
# data = import_data('../data/Avg_55,26.xlsx')
# data['Review'] = preprocess_data(data)

storage = Storage()
# storage.save(data, f"pickle/default-{time.time()}.pckl")
data = storage.load("pickle/default-1541653057.8427656.pckl")

# Split once and build the TFIDF model on the training part only.
train, test = fold_data(data)
train_data = data.iloc[train]
test_data = data.iloc[test]

tfidf = TFIDF(train_data['Review'])
num_attrs = len(tfidf.termIndex)

# Disabled: baseline classifier without attribute selection.
# clf = C45(tfidf, data)
# clf.train()
# score = clf.score(tfidf, test_data)
# print(score)
# 0.3630573248407643

# Run PSO with one dimension per TFIDF term and persist whatever it returns.
# NOTE(review): the meaning of the remaining PSO arguments is defined by the
# PSO class, which is not visible here.
pso = PSO(num_attrs, 20, 20, 0.7, 0.5, 0.99)
result = pso.exec(train_data, test_data)
storage.save(result, f"pickle/pso-1.pckl")
import os
import sys

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent.  The working directory is split on
# os.sep so the check also works where the separator is not a backslash
# (on Windows os.sep is "\\", so behaviour there is unchanged).
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np
import random
import math
import pandas as pd
from libs.TFIDF import TFIDF
from entities.Storage import Storage

# Load the stored dataset and build the TFIDF model over its reviews.
storage = Storage()
data = storage.load('./pickle/default-1541653057.8427656.pckl')
tfidf = TFIDF(data['Review'])

# Length of the first weight row.
print(len(tfidf.weights[0]))

# Dump the full weight table to a spreadsheet in the working directory.
df = pd.DataFrame(tfidf.weights)
df.to_excel('tfidf.xlsx')
def map_bool(x):
    """Return ``True`` when ``x`` equals 1, otherwise ``False``."""
    return True if x == 1 else False


def fold_data(data, k = 2):
    """Return the first ``(train, test)`` pair of a shuffled ``k``-fold split.

    The split uses ``KFold(n_splits=k, shuffle=True, random_state=2)``.
    Only the first pair it yields is used.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=2)
    first_split = next(iter(splitter.split(data)), None)
    if first_split is None:
        return None
    train, test = first_split
    return train, test


# Rebuild the list of selected terms: a boolean mask derived from the particle
# position is applied to the terms of a TFIDF model built on the training part.
train_idx, test_idx = fold_data(data)

pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]

selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

c45 = []
pso_c45 = []

kf = KFold(n_splits=10, shuffle=True, random_state=2)
for i, (train, test) in enumerate(kf.split(data)):
    print("Train optimized")
    tfidf = TFIDF(data.iloc[train]["Review"])
    tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
    # Restrict the term index to the selected terms.
    tfidf.termIndex = {
        key: val
        for key, val in tfidf.termIndex.items()
        if key in features
    }
    clf = C45(tfidf, data.iloc[train])
    clf.train()
    result = clf.score(tfidf, data.iloc[test])
# NOTE(review): the statements down to `return result` are the tail of a
# function whose `def` line lies outside this view; `data`, `preprocessor` and
# `selected_attr` are bound in the part that is not visible.
    result = []
    for i, review in enumerate(data['Review']):
        # Join the items returned by selected_preprocess with single spaces.
        result.append(" ".join(
            preprocessor.selected_preprocess(review, selected_attr)))
        print(f"Review {i + 1} preprocessed")
    return result


storage = Storage()
particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')

# Turn the particle position into a list of Python booleans and use it as a
# mask over the terms of a TFIDF model built on the training part.
train_idx, test_idx = fold_data(data)
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

# Re-import the spreadsheet (rebinding `data`), preprocess its reviews with the
# selected terms, and save the result.
data = import_data('../data/Avg_55,26.xlsx')
data['Review'] = preprocess_data(data, features)
storage.save(data, f"pickle/selected-3.pckl")
# data = storage.load('./pickle/selected-1542024722.200629.pckl')

# Print every label next to its preprocessed review.
for review, label in zip(data["Review"], data["Label"]):
    print(label, review)

tfidf = TFIDF(data["Review"])

# Label -> display label.  Currently an identity mapping.
english_labels = { "Berdampak positif": "Berdampak positif", "Berdampak negatif": "Berdampak negatif", "Netral": "Netral" }