def __init__(self, document, with_gui=True):
    self.Object = document.addObject("App::DocumentObjectGroupPython", "Case")
    self.ObjectProxy = _CaseProxy(self.Object)
    if with_gui:
        from gui.case import ViewProviderCase
        self.ViewProvider = ViewProviderCase(self.Object.ViewObject)
    self.preprocessing = Preprocessing(self.Object, with_gui)
    self.solve = Solve(self.Object, with_gui)
    self.postprocessing = Postprocessing(self.Object, with_gui)
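# --- Usage sketch (illustrative, not from the original project) -------------
# Assumes this __init__ belongs to a class named `Case` and runs inside
# FreeCAD's Python environment; the class name and the headless workflow are
# assumptions, not confirmed by the snippet above.
import FreeCAD

doc = FreeCAD.newDocument("Analysis")
case = Case(doc, with_gui=False)   # skip the ViewProvider when running headless
case.preprocessing                 # stage objects created by __init__
case.solve
case.postprocessing
doc.recompute()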
def backgroundprocess_preprocessing(keyword, data_preprocessing, username):
    model = Model()
    tfidf = TfIdf(model)
    preprocessing = Preprocessing(model)
    list_id_tweet = []
    list_tweet_stemming = []
    list_tfidf = []
    try:
        model.delete_clustered(keyword)
        # Clean and stem every tweet, remembering its id and stemmed text.
        for data_tweet in data_preprocessing:
            id_tweet, tweet_stemming = background_preprocessing(
                data_tweet, keyword, preprocessing, model)
            list_id_tweet.append(id_tweet)
            list_tweet_stemming.append(tweet_stemming)
        # Build the IDF table once over the whole corpus, then weight each tweet.
        tfidf.word_idf(list_tweet_stemming, keyword)
        list_idf = model.select_idf(keyword)
        for index, tweet_stemming in enumerate(list_tweet_stemming):
            id_tweet = list_id_tweet[index]
            tweet_tfidf = tfidf.tf_idf(
                preprocessing.tokennizing(tweet_stemming), keyword, list_idf)
            model.update_tfidf(id_tweet, keyword, tweet_tfidf)
            list_tfidf.append({'id': id_tweet, 'tfidf': tweet_tfidf})
        data = sorted(list_tfidf, key=lambda k: k['id'])
        model.update_vocab_config(data, keyword)
    except Exception as error:
        print(error)
    finally:
        # Flag the job as finished for this user and release the DB connection.
        chache[username]['statuspreprocessing'] = 0
        model.close_connection()
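# --- Usage sketch (illustrative) ---------------------------------------------
# How the job above might be started without blocking the request thread.
# `chache`, Model, TfIdf, Preprocessing and background_preprocessing are
# assumed to be provided by the surrounding project, as used above.
import threading

def start_preprocessing(keyword, data_preprocessing, username):
    chache[username] = {'statuspreprocessing': 1}   # 1 = still running
    worker = threading.Thread(
        target=backgroundprocess_preprocessing,
        args=(keyword, data_preprocessing, username),
        daemon=True,
    )
    worker.start()
    return worker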
class Vocabulary():
    def __init__(self, model):
        self.preprocessing = Preprocessing(model)
        self.model = model

    def read_config_vocabulary(self, keyword):
        config = {'vocab_size': 0, 'data_count': 0}
        result = self.model.select_vocab_config(keyword)
        return result if result else config

    def write_config_vocabulary(self, vocab_size, data_count, keyword):
        self.model.save_vocab_config(vocab_size, data_count, keyword)

    def write_vocabulary(self, list_word, list_idf, list_total, keyword):
        self.model.save_vocabulary(list_word, list_idf, list_total, keyword)

    def read_vocabulary(self, keyword):
        return self.model.select_vocabulary(keyword)

    def create_vocabulary(self, data, keyword):
        vocabulary = self.read_vocabulary(keyword)
        for words in data:
            words_token = self.preprocessing.tokennizing(words)
            for word in words_token:
                if word not in vocabulary:
                    vocabulary.append(word)
        vocabulary_size = len(vocabulary)
        data_count = len(data)
        self.write_config_vocabulary(vocabulary_size, data_count, keyword)
        return vocabulary
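# --- Standalone illustration of the vocabulary-building step above ----------
# Collects the unique tokens across documents in first-seen order. A plain
# whitespace split stands in for Preprocessing.tokennizing here; the real
# class presumably stems and removes stopwords first.
documents = ["harga bensin naik", "bensin langka", "harga sembako naik"]

vocabulary = []
for sentence in documents:
    for word in sentence.split():
        if word not in vocabulary:
            vocabulary.append(word)

print(vocabulary)        # ['harga', 'bensin', 'naik', 'langka', 'sembako']
print(len(vocabulary))   # vocab_size = 5, data_count = len(documents) = 3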
def __init__(self, model):
    self.preprocessing = Preprocessing(model)
    self.model = model
# from app.core.auth_utils import get_current_user
from app.core.session import SessionData
from app.db.session import db_session
# from app.crud import tweet_db
from app.preprocessing import Preprocessing
from app.models.tweet import Tweets, RequestSpecs
from .service import TwitterTweetAnalyzer
from fastapi import APIRouter

session_data = SessionData()
preprocessing = Preprocessing()
router = APIRouter()


@router.post("/", response_model=Tweets, status_code=200)
async def TweetAnalyzer(request_specs: RequestSpecs):
    """
    Route handler for the tweet analysis. Validates both the request and
    response models.

    Arguments:
        request_specs {RequestSpecs} -- brand name to analyze

    Returns:
        Tweets -- response model populated with sentiments and word frequencies
    """
    requested_brand = request_specs.brand
    twitter_tweet_analyzer = TwitterTweetAnalyzer(requested_brand)
    tweets = twitter_tweet_analyzer.download_tweets()
    tweets = twitter_tweet_analyzer.preprocess_tweets()
    tweets = twitter_tweet_analyzer.add_sentiment_to_tweeets()
    tweets = twitter_tweet_analyzer.add_most_frequent_words()
    return tweets
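# --- Illustrative request against the route above ----------------------------
# Assumes the router is mounted on a FastAPI app at the root path; the brand
# value is a placeholder and the call only succeeds if the Twitter download
# and analysis services behind TwitterTweetAnalyzer are actually reachable.
from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()
app.include_router(router)

client = TestClient(app)
response = client.post("/", json={"brand": "ExampleBrand"})
print(response.status_code)   # expected 200 with a Tweets payload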
def __init__(self, model):
    self.preprocessing = Preprocessing(model)
    self.model = model
    self.tf_idf = TfIdf(model)
import copy
import math
import concurrent.futures

import numpy as np
from sklearn.decomposition import TruncatedSVD


class KMeans():
    def __init__(self, model):
        self.preprocessing = Preprocessing(model)
        self.model = model
        self.tf_idf = TfIdf(model)

    def mm_normalize(self, data):
        # Reduce the sparse tf-idf vectors to 5 dimensions with truncated SVD.
        data = np.array(data)
        svd = TruncatedSVD(n_components=5, n_iter=5)
        data = svd.fit_transform(data)
        return data.tolist()
        # Previous cosine-similarity variant, kept for reference:
        # result = []
        # for a in data:
        #     list_cosine = []
        #     for b in data:
        #         cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        #         list_cosine.append(cos_sim)
        #     result.append(list_cosine)
        # return result

    def mm_normalize_predict(self, data, list_data_train):
        # Fit the SVD on the training vectors and project the new sample into
        # the same reduced space so distances to the stored centroids are
        # comparable. (The original returned an empty list because the cosine
        # code below was commented out.)
        svd = TruncatedSVD(n_components=5, n_iter=5)
        svd.fit(list_data_train)
        a = svd.transform(data)
        return a.tolist()[0]
        # Previous cosine-similarity variant, kept for reference:
        # list_cosine = []
        # for b in svd.transform(list_data_train):
        #     b = b.tolist()
        #     cos_sim = np.dot(data, b) / (np.linalg.norm(data) * np.linalg.norm(b))
        #     list_cosine.append(cos_sim.tolist()[0])
        # return list_cosine

    def manhatan(self, data_1, data_2):
        return sum([abs(data_1[i] - data_2[i]) for i in range(len(data_2))])

    def euclidean(self, data_1, data_2):
        return math.sqrt(sum([(data_1[i] - data_2[i]) ** 2 for i in range(len(data_2))]))

    def jacard(self, data_1, data_2):
        inter = list(set(data_1) & set(data_2))
        I = len(inter)
        union = list(set(data_1) | set(data_2))
        U = len(union)
        return round(1 - (float(I) / U), 4)

    def up_date(self, cluster, all_data, centroid_before, k):
        # Recompute each centroid as the mean of its members; an empty cluster
        # keeps its previous centroid.
        new_centroid = []
        for centorid in range(k):
            tweet_dalam_kelas_sekarang = []
            for index, kelas in enumerate(cluster):
                if int(kelas) == int(centorid):
                    tweet_dalam_kelas_sekarang.append(all_data[index])
            if tweet_dalam_kelas_sekarang:
                centroid_tweet_baru = np.array(tweet_dalam_kelas_sekarang).mean(axis=0).tolist()
            else:
                centroid_tweet_baru = centroid_before[centorid]
            new_centroid.append(centroid_tweet_baru)
        return new_centroid

    def hitung_sse(self, tweet1, centroid):
        return math.pow(self.manhatan(tweet1, centroid), 2)

    def sse(self, keyword, cluster, k, centroid_text, all_data):
        sse_total = 0
        for centroid in range(k):
            tweet_dalam_kelas_sekarang = []
            centroid_now = centroid_text[centroid]
            for index, kelas in enumerate(cluster):
                if kelas == centroid:
                    tweet_dalam_kelas_sekarang.append(all_data[index])
            with concurrent.futures.ThreadPoolExecutor(max_workers=200) as executor:
                myfuture = {executor.submit(self.hitung_sse, tweet1, centroid_now): tweet1
                            for tweet1 in tweet_dalam_kelas_sekarang}
                for future in concurrent.futures.as_completed(myfuture):
                    sse_total = sse_total + future.result()
        self.write_sse(sse_total, keyword)
        return sse_total

    def output(self, id, keyword, cluster, k):
        dict_final = {}
        for centorid in range(k):
            id_tweet_dalam_kelas_sekarang = []
            for index2, kelas in enumerate(cluster):
                if kelas == centorid:
                    id_tweet_dalam_kelas_sekarang.append(id[index2])
            dict_final[centorid + 1] = id_tweet_dalam_kelas_sekarang
        self.write_cluster(dict_final, keyword)

    def read_sse(self, keyword):
        return self.model.select_sse_args(keyword)

    def write_sse(self, data, keyword):
        self.model.save_sse(data, keyword)

    def read_model(self, keyword):
        return self.model.select_centroid_cluster_args(keyword)

    def write_model(self, data, cluster, keyword):
        self.model.delete_centroid_cluster(keyword)
        for key, value in data.items():
            cluster_name = cluster[key - 1]
            self.model.save_centroid_cluster(value, keyword, key, cluster_name)

    def read_cluster(self, keyword):
        return self.model.select_cluster_args(keyword)

    def write_cluster(self, data, keyword):
        self.model.delete_cluster(keyword)
        for key, value in data.items():
            for i in value:
                self.model.save_cluster(i, keyword, key)

    def fit(self, keyword, id, centroids, cluster_name, all_data, jumlah_tweet, k=3, iterasi=100):
        cluster_outer = []
        dict_model_outer = {}
        sse_outer = 0
        all_data = self.mm_normalize(all_data)
        # Initial centroids are the (reduced) vectors of the chosen tweet ids.
        centroid_text = [all_data[id.index(int(x))] for x in centroids]
        for iterat in range(int(iterasi)):
            print("iterasi ke -> ", iterat)
            cluster = []
            # Assign every tweet to the nearest centroid (Manhattan distance).
            for i in range(jumlah_tweet):
                jarak_antar_centorid = [self.manhatan(all_data[i], centroid_text[j]) for j in range(k)]
                kelas_terdekat = jarak_antar_centorid.index(min(jarak_antar_centorid))
                cluster.append(kelas_terdekat)
            print(cluster)
            update_centroid_terbaru = self.up_date(cluster, all_data, centroid_text, k)
            centroid_text = copy.deepcopy(update_centroid_terbaru)
            dict_model_inner = {}
            for index, data in enumerate(centroid_text):
                dict_model_inner[index + 1] = data
            sse_inner = self.sse(keyword, cluster, k, centroid_text, all_data)
            # Stop when the SSE no longer improves; otherwise keep this result.
            if sse_inner >= sse_outer and sse_outer != 0:
                break
            cluster_outer = copy.deepcopy(cluster)
            dict_model_outer = copy.deepcopy(dict_model_inner)
            sse_outer = copy.deepcopy(sse_inner)
        self.write_model(dict_model_outer, cluster_name, keyword)
        self.clustering = self.output(id, keyword, cluster_outer, k)
        self.sse(keyword, cluster_outer, k, centroid_text, all_data)
        return 0

    def predict(self, data, keyword):
        cluster_name, emoticon = None, None
        try:
            total_tweets, train_data = self.model.select_data_training(keyword)
            list_preprocessing = [x['tfidf'] for x in train_data]
            data_lower, data_regex, data_stopword, data_stemming = self.preprocessing.cleansing(data)
            data = self.preprocessing.tokennizing(data_stemming)
            data = [self.tf_idf.tf_idf(data, keyword)]
            data = self.mm_normalize_predict(data, list_preprocessing)
            model = self.read_model(keyword)
            model_nm = self.model.select_cluster_names(keyword)
            k = len(model)
            d = [self.manhatan(data, model.get(j + 1)) for j in range(k)]
            cluster = d.index(min(d)) + 1
            cluster_name = model_nm.get(cluster)
            emoticon = self.model.select_emoticon(cluster, keyword)
        except Exception as error:
            print(error)
        finally:
            self.model.close_connection()
        return cluster_name, emoticon
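# --- Self-contained illustration of the assignment step in fit() ------------
# Each vector joins the cluster whose centroid is nearest under the Manhattan
# distance used above. The numbers are made up for the example.
def manhattan(a, b):
    return sum(abs(a[i] - b[i]) for i in range(len(b)))

centroids = [[0.0, 0.0], [1.0, 1.0]]
points = [[0.1, 0.2], [0.9, 1.1], [0.3, 0.2]]

cluster = []
for p in points:
    distances = [manhattan(p, c) for c in centroids]
    cluster.append(distances.index(min(distances)))

print(cluster)   # [0, 1, 0]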
def __init__(self, model):
    self.vocabulary = Vocabulary(model)
    self.proprecessing = Preprocessing(model)
    self.model = model
import math


class TfIdf():
    def __init__(self, model):
        self.vocabulary = Vocabulary(model)
        self.proprecessing = Preprocessing(model)
        self.model = model

    def word_idf(self, document, keyword):
        # Compute idf = log(N / df) for every vocabulary word, discarding words
        # that appear in fewer than 2 or more than 50 documents.
        total_document = len(document)
        vocabulary = self.vocabulary.create_vocabulary(document, keyword)
        idf = []
        total = []
        new_vocabulary = []
        for feature in vocabulary:
            total_this_feature = 0
            for data in document:
                list_word = self.proprecessing.tokennizing(data)
                if feature in list_word:
                    total_this_feature = total_this_feature + 1
            if total_this_feature < 2 or total_this_feature > 50:
                continue
            idf.append(math.log(total_document / total_this_feature))
            total.append(total_this_feature)
            new_vocabulary.append(feature)
        self.write_vocabulary(new_vocabulary, idf, total, keyword)

    def tf(self, word1, data, length_sentences):
        freq_word = 0
        for word2 in data:
            if word1 == word2:
                freq_word = freq_word + 1
        return float(freq_word / length_sentences)

    def tf_idf(self, data, keyword, list_idf=False):
        # Build a dense tf-idf vector ordered by the stored vocabulary.
        value = []
        list_idf = self.read_idf(keyword) if not list_idf else list_idf
        list_word_text_now = []
        list_freq_word_now = []
        length_sentences = len(data)
        for word1 in data:
            if word1 not in list_word_text_now:
                tf = self.tf(word1, data, length_sentences)
                list_word_text_now.append(word1)
                list_freq_word_now.append(tf)
        for features in list_idf:
            word = features['word']
            idf = features['idf']
            if word in list_word_text_now:
                freq_word_in_sentences = list_freq_word_now[list_word_text_now.index(word)]
                value.append(float(freq_word_in_sentences) * float(idf))
            else:
                value.append(0)
        return value

    def write_vocabulary(self, vocabulary, idf, total, keyword):
        self.vocabulary.write_vocabulary(vocabulary, idf, total, keyword)

    def read_idf(self, keyword):
        return self.model.select_idf(keyword)
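# --- Worked example of the tf and idf formulas used above --------------------
# tf(word, doc) = count(word in doc) / len(doc)
# idf(word)     = log(N / df(word)), with N = number of documents.
# (word_idf above additionally discards words with df < 2 or df > 50.)
import math

documents = [
    ["harga", "bensin", "naik"],
    ["bensin", "langka"],
    ["harga", "sembako", "naik"],
]
N = len(documents)

df_bensin = sum(1 for doc in documents if "bensin" in doc)   # df = 2
idf_bensin = math.log(N / df_bensin)                         # log(3/2) ~ 0.405

doc = documents[0]
tf_bensin = doc.count("bensin") / len(doc)                   # 1/3 ~ 0.333
print(tf_bensin * idf_bensin)                                # ~ 0.135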