Exemple #1
0
	def calculate_best(self, train, test):
		"""Train a C45 classifier on the attributes selected by ``self.position``.

		The position vector is cast to booleans and used as a mask over the
		entries of the TF-IDF term index (in iteration order). The classifier
		is stored on ``self.clf`` and its score on ``test`` on ``self.best``.

		Args:
			train: Training data; its "Review" column feeds the TF-IDF model.
			test: Data handed to ``clf.score`` for evaluation.

		Returns:
			The value returned by ``clf.score``, also kept in ``self.best``.
		"""
		mask = self.position.astype(bool)
		vectorizer = TFIDF(train["Review"])
		vectorizer.weights = vectorizer.remove_zero_tfidf(vectorizer.weights, 0.5)
		# Keep only the term-index entries whose mask slot is truthy.
		kept_terms = {}
		for idx, (term, value) in enumerate(vectorizer.termIndex.items()):
			if mask[idx]:
				kept_terms[term] = value
		vectorizer.termIndex = kept_terms
		print(f"Selected attributes: {len(vectorizer.termIndex)}")
		self.clf = C45(vectorizer, train)
		self.clf.train()
		self.best = self.clf.score(vectorizer, test)
		return self.best
 def mltrain_fn(self, params=None):
     """Train a C45 classifier on one stored training fold.

     Args:
         params: Optional dict of settings. Recognised keys:
             'i' - fold number; the file ``data/folds/train{i + 1}.pckl``
             is loaded through ``self.storage``.
             'remove_zero_tfidf' - when truthy, the TF-IDF weights are
             passed through ``remove_zero_tfidf`` with a 0.4 argument.
             'UI' - passed through to the return value (falsy becomes None).
             Keys that are missing fall back to None / False / None.

     Returns:
         A tuple ``(i, clf, tfidf, ui)`` with the fold number, the trained
         classifier, the TF-IDF model and the 'UI' entry (or None).

     Raises:
         TypeError: If no fold number is supplied.
     """
     # Build the settings per call: a dict literal used as a default argument
     # is created once and shared by every invocation.
     settings = {'i': None, 'remove_zero_tfidf': False, 'UI': None}
     if params is not None:
         settings.update(params)

     fold = settings['i']
     if fold is None:
         # Same exception type the arithmetic below would raise, but with a
         # message that names the actual problem.
         raise TypeError("params['i'] must be a fold number, got None")

     train = self.storage.load(f"data/folds/train{fold + 1}.pckl")
     tfidf = TFIDF(train["Review"])
     if settings['remove_zero_tfidf']:
         tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.4)
     clf = C45(tfidf, train)
     clf.train()
     return fold, clf, tfidf, settings['UI'] or None
Exemple #3
0
        return train, test


def import_data(filename):
    """Announce the import on stdout and return the data read from ``filename``.

    The file is handed to ``DataImporter`` and whatever its ``get_data``
    method returns is passed back unchanged.
    """
    print(f"Import {filename}")
    return DataImporter(filename).get_data()


# Load a previously pickled particle and cast its position vector to booleans.
storage = Storage()
particle = storage.load('./pickle/pso-1.pckl')
pos = particle.position.astype(bool)
# Run every entry through map_bool so `pos` becomes a plain list used as a mask below.
pos = [map_bool(x) for x in pos]
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
# Build TF-IDF on the training rows and take its term-index keys as the candidate features.
# NOTE(review): assumes the key order matches the order the particle's position refers to - confirm.
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

# Re-import the spreadsheet and preprocess its reviews with the selected features.
new_data = import_data('../data/Avg_55,26.xlsx')
new_data['Review'] = preprocess_data(new_data, features)

# Save the result under a file name that embeds the current timestamp.
storage.save(new_data, f"pickle/preprocessed-{time.time()}.pckl")

# kf = KFold(n_splits=10, shuffle=True, random_state=2)
# for i, (train, test) in enumerate(kf.split(data)):
# 	print("Train optimized")
# 	tfidf = TFIDF(data.iloc[train]["Review"])
# 	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
# 	tfidf.termIndex = {key:val for i, (key, val) in enumerate(tfidf.termIndex.items()) if key in features}
# 	clf = C45(tfidf, data.iloc[train])
import sys, os
# Split the working directory with the platform's own separator so the
# "revisions" check also works where paths use "/" rather than "\\".
cwd = os.path.normpath(os.getcwd()).split(os.sep)
# Add the parent of "revisions" to the import path, whether the script is
# started from inside "revisions" or from the directory above it.
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# Load the pickled dataset and build TF-IDF weights over every review.
s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])
# Label lookup table. Every entry currently maps a label to itself, so the
# lookup below leaves the labels unchanged; an unknown label raises KeyError.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
# Colour name assigned to each label.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

# Fit a two-component PCA on the TF-IDF weights and project them.
pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
# Standard deviation of the projected points along each component.
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])
Exemple #5
0
def preprocess_data(data):
    """Preprocess every entry of ``data['Review']`` and return the joined text.

    Each review is passed to ``Preprocessor.preprocess``; the items it returns
    are joined with single spaces. Progress is printed after each review.

    Returns:
        A list of strings, one per review, in the original order.
    """
    print(f"Preprocess data...")
    cleaner = Preprocessor()
    cleaned = []
    for count, text in enumerate(data['Review'], start=1):
        tokens = cleaner.preprocess(text)
        cleaned.append(" ".join(tokens))
        print(f"Review {count} preprocessed")
    return cleaned


# data = import_data('../data/Avg_55,26.xlsx')
# data['Review'] = preprocess_data(data)
# The import/preprocess/save steps are disabled; the pickled dataset is loaded instead.
storage = Storage()
# storage.save(data, f"pickle/default-{time.time()}.pckl")
data = storage.load("pickle/default-1541653057.8427656.pckl")
# fold_data supplies the row indexers for the train and test subsets.
train, test = fold_data(data)
train_data = data.iloc[train]
test_data = data.iloc[test]

# The number of attributes equals the size of the TF-IDF term index of the training reviews.
tfidf = TFIDF(train_data['Review'])
num_attrs = len(tfidf.termIndex)
# clf = C45(tfidf, data)
# clf.train()

# score = clf.score(tfidf, test_data)
# print(score) # 0.3630573248407643

# Run PSO over num_attrs attributes and pickle whatever exec returns.
# NOTE(review): the meaning of the remaining numeric arguments is not visible here - check the PSO constructor.
pso = PSO(num_attrs, 20, 20, 0.7, 0.5, 0.99)
result = pso.exec(train_data, test_data)
storage.save(result, f"pickle/pso-1.pckl")
Exemple #6
0
import sys, os
# Split the working directory with the platform's own separator so the
# "revisions" check also works where paths use "/" rather than "\\".
cwd = os.path.normpath(os.getcwd()).split(os.sep)
# Add the parent of "revisions" to the import path, whether the script is
# started from inside "revisions" or from the directory above it.
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np, random, math, pandas as pd
from libs.TFIDF import TFIDF
from entities.Storage import Storage

# Load the pickled dataset and compute TF-IDF weights for all reviews.
storage = Storage()
data = storage.load('./pickle/default-1541653057.8427656.pckl')
tfidf = TFIDF(data['Review'])
# Print the length of the first weight row.
print(len(tfidf.weights[0]))

# Write the weights to an Excel file via a DataFrame.
df = pd.DataFrame(tfidf.weights)
df.to_excel('tfidf.xlsx')
def map_bool(x):
	"""Return the built-in ``True`` when ``x`` compares equal to 1, else ``False``."""
	return bool(x == 1)

def fold_data(data, k=2):
	"""Return the first (train, test) pair of a shuffled K-fold split of ``data``.

	The split uses ``k`` folds with shuffling and a fixed ``random_state`` of 2.
	If the splitter yields nothing, ``None`` is returned.
	"""
	splitter = KFold(n_splits=k, shuffle=True, random_state=2)
	first_split = next(iter(splitter.split(data)), None)
	if first_split is None:
		return None
	train_part, test_part = first_split
	return train_part, test_part

# First train/test split of the dataset, using fold_data's default k.
train_idx, test_idx = fold_data(data)

# NOTE(review): `particle` is not assigned in the visible part of this script - confirm it is loaded earlier.
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
# Keep only the terms whose mask entry is True.
features = features[pos]

c45 = []
pso_c45 = []

# 10-fold shuffled split with a fixed random_state.
kf = KFold(n_splits=10, shuffle=True, random_state=2)
for i, (train, test) in enumerate(kf.split(data)):
	print("Train optimized")
	tfidf = TFIDF(data.iloc[train]["Review"])
	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
	# Restrict the term index to the selected features; the comprehension's own `i` is unused.
	tfidf.termIndex = {key:val for i, (key, val) in enumerate(tfidf.termIndex.items()) if key in features}
	clf = C45(tfidf, data.iloc[train])
	clf.train()
	# NOTE(review): `result` is overwritten on every fold and nothing is appended to c45 / pso_c45 in the visible lines.
	result = clf.score(tfidf, data.iloc[test])
    result = []
    for i, review in enumerate(data['Review']):
        result.append(" ".join(
            preprocessor.selected_preprocess(review, selected_attr)))
        print(f"Review {i + 1} preprocessed")
    return result


storage = Storage()

# Load a pickled particle and the pickled dataset, then derive the feature mask.
particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
# Term-index keys of the training-row TF-IDF, filtered by the mask.
# NOTE(review): assumes the key order matches the order the particle's position refers to - confirm.
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

# From here on `data` is rebound to the freshly imported spreadsheet.
data = import_data('../data/Avg_55,26.xlsx')
data['Review'] = preprocess_data(data, features)
storage.save(data, f"pickle/selected-3.pckl")
# data = storage.load('./pickle/selected-1542024722.200629.pckl')
# Print every label next to its preprocessed review.
for review, label in zip(data["Review"], data["Label"]):
    print(label, review)
tfidf = TFIDF(data["Review"])
# Label lookup table; every entry currently maps a label to itself.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}