def find_similarity():
    # Read every file from both folders, preprocess its text, and collect it
    # for the TF-IDF / cosine similarity comparison.
    file_text = []
    raw_files = os.listdir(path)
    for file in raw_files:
        fp = open(path + '\\' + file, encoding="utf8")
        filetext = fp.readlines()
        fp.close()
        file_text.append(text_preprocessing(" ".join(filetext)))

    raw_files = os.listdir(files_path)
    for file in raw_files:
        fp = open(files_path + '\\' + file, encoding="utf8")
        filetext = fp.readlines()
        fp.close()
        file_text.append(text_preprocessing(" ".join(filetext)))

    # findSimByTfCos returns the index of the most similar file,
    # or the sentinel value 1000 when no match is found.
    index = findSimByTfCos(file_text)
    if index != 1000:
        result = raw_files[index]
        # The file name encodes the folder name before its second underscore.
        val = -1
        for i in range(0, 2):
            val = result.find('_', val + 1)
        return render_template("cmp_files.html",
                               file_name=cmp_query_file_name,
                               folder_name=result[:val],
                               result=cmp_query_file_name + " is similar to " + result[val + 1:])
    else:
        return render_template("cmp_files.html",
                               file_name=cmp_query_file_name,
                               folder_name="",
                               result="plagiarised")
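# The helper findSimByTfCos is not shown above. A minimal sketch of what it
# might look like, assuming scikit-learn is available, that the query document
# is the first element of file_text, and that 1000 is the "no match" sentinel
# used by find_similarity; the threshold and document ordering are assumptions
# and the real implementation may differ.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def findSimByTfCos(file_text, threshold=0.5):
    # Vectorize all documents with TF-IDF; row 0 is assumed to be the query.
    tfidf = TfidfVectorizer().fit_transform(file_text)
    # Cosine similarity of the query against every other document.
    scores = cosine_similarity(tfidf[0], tfidf[1:]).flatten()
    best = scores.argmax()
    # Return the sentinel 1000 when no document is similar enough.
    return int(best) if scores[best] >= threshold else 1000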
def get_prediction(self, sentence, start_tag="[", end_tag="]"):
    sentence = text_preprocessing(sentence)
    seq = self.data_tokenizer.texts_to_sequences([sentence])
    seq = seq[0]

    # Predict a category for every NGRAM-sized window and remember the first
    # two words of the window together with the predicted class.
    result = []
    insert_end = False
    insert_start = True
    for idx in range(0, len(seq) - NGRAM + 1):
        category = self.model.predict(np.atleast_2d([seq[idx: idx + NGRAM]]))
        cat = category.argmax()
        result.append([self.data_index[seq[idx]],
                       self.data_index[seq[idx + 1]],
                       cat])

    # Rebuild the sentence, wrapping runs of class-1 words in start/end tags.
    string_result = []
    for item in result:
        if item[2] == 1 and insert_start:
            string_result.append(start_tag)
            string_result.append(item[0])
            insert_end = True
            insert_start = False
        elif item[2] == 0 and insert_end:
            string_result.append(item[0])
            string_result.append(end_tag)
            insert_end = False
            insert_start = True
        else:
            string_result.append(item[0])

    # Append the second word of the last window, closing the tag if needed.
    string_result.append(result[-1][1])
    if result[-1][2] == 1:
        string_result.append(end_tag)
    return " ".join(string_result)
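# Hypothetical usage of get_prediction, assuming the surrounding class is
# instantiated as `tagger` and that class 1 marks words to be highlighted;
# the class name and the output below are illustrative assumptions only.
# tagger = SequenceTagger(...)   # assumed constructor, not shown above
# print(tagger.get_prediction("the quick brown fox jumps over the lazy dog"))
# Possible output: "the quick [ brown fox ] jumps over the lazy dog"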
def doc2vec_Fun():
    df = pd.read_excel('fake_new_dataset.xlsx')
    df.title = df.title.astype(str)
    df.text = df.text.astype(str)
    df['news'] = df['title'] + df['text']
    df.drop(labels=['title', 'text'], axis=1, inplace=True)
    df.drop(labels=['subcategory'], axis=1, inplace=True)
    list_label = [0, 0, 1, 1, 0]

    # Preprocess the first five news items only.
    doc = []
    for item in df['news']:
        item = preprocessing.text_preprocessing(item)
        doc.append(item)
        if len(doc) == 5:
            break

    tokenized_doc = []
    for d in doc:
        tokenized_doc.append(word_tokenize(d.lower()))
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

    # Train a Doc2Vec model and collect one vector per document.
    model = Doc2Vec(tagged_data, vector_size=100, window=2, min_count=1,
                    workers=4, epochs=100)
    list_data = []
    for index in range(0, len(model.dv)):
        list_data.append(model.dv[index])
    return list_data, list_label
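# A minimal sketch of how the Doc2Vec vectors returned by doc2vec_Fun could be
# used downstream, assuming scikit-learn is available; the classifier choice
# (logistic regression) is an assumption and not part of the code above.
from sklearn.linear_model import LogisticRegression

vectors, labels = doc2vec_Fun()
clf_doc2vec = LogisticRegression(max_iter=1000)
clf_doc2vec.fit(vectors, labels)          # train on the five example documents
print(clf_doc2vec.predict(vectors[:2]))   # predict labels for the first two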
def predict(data):
    # preprocessing
    data = preprocessing.define_client_type(data)
    data = data['client'] + ' ' + data['topic'] + ' ' + data['description']
    data = preprocessing.text_preprocessing(data)
    prediction = model.predict([data])
    prediction = le.inverse_transform(prediction)
    return prediction[0]
def test_model(file, model_svm, vect, le):
    test_input_data, test_output_data = get_test_data(file)
    test_input_data = text_preprocessing(test_input_data)
    test_input_data = vect.transform(test_input_data)
    test_output_data = le.transform(test_output_data)
    accuracy = model_svm.score(test_input_data, test_output_data)
    predicted_value = model_svm.predict(test_input_data)
    matrix, precision, recall, f1 = evaluating_model(test_output_data, predicted_value)
    return accuracy, matrix, precision, recall, f1
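# The helper evaluating_model is not shown here. A minimal sketch of what it
# might compute, assuming scikit-learn metrics and weighted averaging; the
# actual averaging mode used in the project may differ.
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


def evaluating_model(y_true, y_pred):
    matrix = confusion_matrix(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return matrix, precision, recall, f1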
def train_model(file):
    input_data, output_data = create_input_output(file)
    input_data = text_preprocessing(input_data)
    vect, input_data = vectorize(input_data)
    le, output_data = label_encoding(output_data)
    print("Training SVM model:\n")
    model_svm = model_support_vector_machine(input_data, output_data)
    print("SVM model trained successfully\n")
    accuracy = model_svm.score(input_data, output_data)
    return accuracy, model_svm, vect, le
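# The helpers vectorize, label_encoding and model_support_vector_machine are
# not shown above. Minimal sketches, assuming scikit-learn, a TF-IDF
# vectorizer and a linear-kernel SVM; the real project may use different
# vectorizer settings or kernels.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


def vectorize(texts):
    vect = TfidfVectorizer()
    return vect, vect.fit_transform(texts)


def label_encoding(labels):
    le = LabelEncoder()
    return le, le.fit_transform(labels)


def model_support_vector_machine(X, y):
    model = SVC(kernel='linear')
    model.fit(X, y)
    return model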
def get_scoring(self, sentence):
    sentence = text_preprocessing(sentence)
    seq = self.data_tokenizer.texts_to_sequences([sentence])
    seq = seq[0]
    word_seq = [self.data_index[_] for _ in seq]
    word_score = {}
    # Slide an NGRAM-sized window over the sequence (as in get_prediction)
    # and accumulate the predicted class for the first word of each window.
    for idx in range(0, len(seq) - NGRAM + 1):
        category = self.model.predict(np.atleast_2d([seq[idx: idx + NGRAM]]))
        cat = category.argmax()
        word = self.data_index[seq[idx]]
        word_score[word] = word_score.get(word, 0) + cat
    return word_seq, word_score
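# Hypothetical usage of get_scoring, assuming the surrounding class is
# instantiated as `tagger` (same assumption as for get_prediction above);
# word_score maps each word to how many windows starting at it were class 1.
# words, scores = tagger.get_scoring("the quick brown fox jumps over the lazy dog")
# print(scores)   # e.g. {'quick': 0, 'brown': 2, 'fox': 1, ...} (illustrative only)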
def predict():
    if request.method == 'POST':
        message = request.form['message']
        if len(message) != 0:
            text = [message]
            data = preprocessing.text_preprocessing(text)
            vect = cv.transform(data)
            my_prediction = clf.predict(vect)
        else:
            my_prediction = 2
        return render_template('home.html', prediction=my_prediction)
def load_texts_for_dataset(file, dir_name):
    """
    Load text data for training the network.

    Arguments:
        file ------ str, name of the file to load
        dir_name -- str, directory where the file is located

    Returns:
        doc ------- str, the file contents, read and preprocessed (lower-cased)
    """
    file_name = os.path.join(dir_name, file)
    with open(file_name, 'r', encoding='utf-8') as file_read:
        doc = text_preprocessing(file_read.read())
    return doc
def main():
    # locate and load the dataset
    dataset_loc = "Dataset/Dataset AlQuran Multilabel.xlsx"
    dataset = read_dataset(dataset_loc)[:100]

    # select the "terjemahan" (translation) column from the dataset
    en_verses = [row[3] for row in dataset]

    # preprocessing phase
    preprocessed_text = [text_preprocessing(verse) for verse in en_verses]

    # tfidf phase
    tfidf_matrix = tf_idf(preprocessed_text)

    # locate and write the tfidf matrix
    output_loc = "Output/test_output.xlsx"
    write_data(output_loc, tfidf_matrix)
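# The helper tf_idf is not shown above. A minimal sketch using scikit-learn,
# assuming it should return a dense document-term matrix that write_data can
# dump to a spreadsheet; the project's own implementation may differ (the
# k-fold script further below also returns the vocabulary).
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf(texts):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)
    # Return a plain list of rows so it can be written to a spreadsheet.
    return matrix.toarray().tolist()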
def classify_sentiment(review):
    review = list(review.split('\n'))
    review = text_preprocessing(review)
    review = vect.transform(review)
    result = le.inverse_transform(model.predict(review))
    return result
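# Hypothetical usage of classify_sentiment, assuming the module-level `vect`,
# `le` and `model` objects have already been loaded; the label names shown are
# illustrative and depend on what the LabelEncoder was fitted on.
sample_review = "Great product, works as expected.\nDelivery was slow though."
print(classify_sentiment(sample_review))   # e.g. ['positive' 'negative']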
"Perancangan User Experience Aplikasi Pendukung Evaluasi dan Analisis Proses Pembelajaran untuk Guru Berbasis Android dengan Metode User-Centered Design dan Design Solution", "Pengaruh Kualitas Implementasi Model Pembelajaran Tipe Student Teams Achievements Divisions (STAD) dan Model Pembelajaran Tipe Numbered Head Together (NHT) terhadap Hasil Belajar Siswa Kelas X Program Keahlian Teknik Komputer dan Informatika Mata Pelajaran", "Pengembangan Sistem Manajemen Penjadwalan Les Privat Berbasis Web (Studi Kasus: Naoyuki Academic Center)", "Pemodelan Arsitektur Bisnis Guna Mendukung Bisnis Berkelanjutan Menggunakan Pendekatan Enterprise Architecture (Studi Kasus: Kedai Kopi “Kopi Soe Malang”)", "Prediksi Harga Emas Dengan Menggunakan Metode Average-Based Fuzzy Time Series", "Evaluasi Usability dan Rekomendasi Perbaikan pada Aplikasi E-Kinerja Kabupaten Kediri menggunakan Metode Heuristic Evaluation", "Pengembangan Sistem Manajemen Notulensi dan Dokumentasi Rapat Berbasis Web (Studi Kasus: Jurusan Teknik Informatika Fakultas Ilmu Komputer Universitas Brawijaya)", "Pengembangan Sistem Monitoring Tingkat Stres berbasis Website", "Temu Kembali Informasi Lintas Bahasa Dokumen Berita Bahasa Indonesia-Inggris menggunakan Metode BM25F", "Prediksi Kecenderungan Pelanggan Telat Bayar pada Layanan Pembiayaan Adira Finance Saluran E-Commerce", "Pengembangan Modul Digital Interaktif Berbasis Website menggunakan Kerangka Kerja Borg, Gall, And Gall pada Mata Pelajaran Administrasi Sistem Jaringan di SMK Negeri 12 Malang", "Klasifikasi Jurusan Siswa menggunakan K-Nearest Neighbor dan Optimasi dengan Algoritme Genetika (Studi Kasus: SMAN 1 Wringinanom Gresik)", "Analisis Pengalaman Pengguna Aplikasi Pemesanan Tiket Bioskop menggunakan User Experience Questionnaire (UEQ) dan Heuristic Evaluation (HE)", "Evaluasi dan Perancangan User Experience menggunakan Metode Human Centered Design dan Heuristic Evaluation pada Aplikasi Dunia Games"]] preprocessing_doc = preprocessing.text_preprocessing(document) document_weighting = tfidf.tfIdfCalculation([preprocessing_doc]) # preprocessing_doc = [] # # for d in document: # preprocessing_doc.append(preprocessing.text_preprocessing(d)) # document_weighting = tfidf.tfIdfCalculation(preprocessing_doc) print(document_weighting) # som = som.selfOrganizingMaps(document_weighting, 0.6, 0.5, 10) # print(som) # Visualisasi # X, target = make_blobs(n_samples=30, n_features=2, centers=3)
def main():
    # locate and load the dataset
    dataset_loc = "./Dataset AlQuran Multilabel.xlsx"
    dataset = read_dataset(dataset_loc)[:200]

    # select the "terjemahan" (translation) column from the dataset
    en_verses = [row[3] for row in dataset]

    # preprocessing phase
    preprocessed_text = [text_preprocessing(verse) for verse in en_verses]

    # tfidf phase
    tfidf_matrix, vocab = tf_idf(preprocessed_text[:200])

    k_fold = 5
    pnn = 4
    tfidf_matrix_split = chunkIt(tfidf_matrix, k_fold)
    print(tfidf_matrix_split)
    print(len(tfidf_matrix_split))

    count_all_hamming = 0
    for i in range(k_fold):
        print("Fold -", i + 1)
        selisih = 0      # misclassified (document, label) pairs in this fold
        count_test = 0   # number of test documents in this fold

        # columns 4..19 of the dataset hold the 16 binary labels
        for i_label in range(4, 20):
            test = []
            train = []
            target_train = []
            target_test = []
            target_actual = []

            label = [row[i_label] for row in dataset]
            label_split = chunkIt(label, k_fold)

            # fold i is the test set, the remaining folds form the training set
            for j in range(len(tfidf_matrix_split)):
                if j == i:
                    test.extend(tfidf_matrix_split[j])
                    target_test.extend(label_split[j])
                else:
                    train.extend(tfidf_matrix_split[j])
                    target_train.extend(label_split[j])
            count_test = len(test)

            print("Label -", i_label - 3)
            for a in range(len(test)):
                # Euclidean distance from this test document to every training document
                jarak = []
                for b in range(len(train)):
                    jarak.append(
                        math.sqrt(
                            sum(np.subtract(np.array(test[a]), np.array(train[b])) ** 2)))
                print(len(jarak))

                # keep the pnn nearest neighbours of each class (0 and 1)
                jarakKlas = []
                for b in range(2):  # classes [0, 1]
                    tmp_jarakKlas = []
                    for c in range(len(jarak)):
                        if target_train[c] == b:
                            tmp_jarakKlas.append(jarak[c])
                    tmp_jarakKlas.sort()
                    for line in tmp_jarakKlas[:pnn]:
                        jarakKlas.append([b, line])

                # group the kept distances by class
                dist_by_class = {}
                for x in jarakKlas:
                    dist_by_class.setdefault(x[0], []).append(x[1])

                # rank-weighted average distance per class (closer neighbours weigh more)
                avgDict = {}
                for k, v in dist_by_class.items():
                    avgDict[k] = sum((1.0 / rank) * dist for rank, dist in enumerate(v, 1))

                # the class with the minimum score is the prediction
                min_index, min_value = min(avgDict.items(), key=lambda item: item[1])
                target_actual.append(min_index)

            for y in range(len(target_test)):
                if target_actual[y] != target_test[y]:
                    selisih += 1

        print(selisih)
        print(count_test)
        # Hamming loss for this fold: misclassified pairs / (documents * 16 labels)
        nilai = count_test * 16
        hamming_loss = float(selisih) / nilai
        print("hamming_loss :", hamming_loss)
        count_all_hamming += hamming_loss
        print("")

    print("Overall Hamming result")
    avg_hamming = count_all_hamming / k_fold
    print(avg_hamming)
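# A minimal sketch of how the per-fold Hamming loss above could be
# cross-checked with scikit-learn, assuming the per-label predictions were
# collected into (documents x 16) indicator arrays `y_true` and `y_pred`;
# these array names and the random placeholders are assumptions and do not
# appear in the script above.
import numpy as np
from sklearn.metrics import hamming_loss as sk_hamming_loss

y_true = np.random.randint(0, 2, size=(40, 16))   # placeholder ground truth
y_pred = np.random.randint(0, 2, size=(40, 16))   # placeholder predictions
# Fraction of wrongly predicted (document, label) pairs, matching
# selisih / (count_test * 16) in the script above.
print(sk_hamming_loss(y_true, y_pred))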