def proses(test_str):
    # Spell-correct each query token against the root-word list (kata dasar)
    # using bigram sets and Jaccard similarity; known words pass through unchanged.
    kata_pengganti = ""
    katadasar = tampil.Tampil_KataDasar()
    katadasar = [i[0] for i in katadasar]
    query = preprocessing.PreProcess(test_str)
    query = query.split()
    hasil = []
    for i in range(len(query)):
        if query[i] in katadasar:
            hasil.append(query[i])
        else:
            word = bigram.urut(query[i])
            nilai = 0
            for j in range(len(katadasar)):
                kata_urut = bigram.urut(katadasar[j])
                nilai_tertinggi = jaccard.compute_jaccard_similarity_score(word, kata_urut)
                if nilai_tertinggi > nilai:
                    nilai = nilai_tertinggi
                    kata_pengganti = katadasar[j]
            hasil.append(kata_pengganti)
    return " ".join(hasil)
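# For reference, a minimal sketch of what the two helpers used above are assumed to
# do (character-bigram extraction and Jaccard similarity). This is an assumption:
# the real bigram.urut and jaccard.compute_jaccard_similarity_score live in their
# own modules and may differ in detail (e.g. how bigrams are ordered).
def _char_bigrams_sketch(word):
    # "puasa" -> {"pu", "ua", "as", "sa"}
    return {word[i:i + 2] for i in range(len(word) - 1)}

def _jaccard_sketch(a, b):
    # |A intersect B| / |A union B| over the two bigram sets
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0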
def createDictionary():
    # Build an inverted index: token -> list of hadith files containing it.
    wordsAdded = {}
    data = tampil.Tampil_Hadis()
    folder = "Hadits Bukhari-Muslim/"
    fileList = []
    for i in range(len(data)):
        fileList.append(data[i][2])
    for file in fileList:
        with open(folder + file, 'r') as f:
            words = f.read()
            words = preprocessing.PreProcess(words)
            words = urut(words)
            words = urut2(words)
            for word in words:
                if word not in wordsAdded:
                    wordsAdded[word] = [f.name]
                elif f.name not in wordsAdded[word]:
                    # compare against the stored full path so each file is listed once
                    wordsAdded[word].append(f.name)
    with open('indexing.txt', 'w') as json_file:
        json.dump(wordsAdded, json_file)
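# Hypothetical usage sketch: read the inverted index written by createDictionary()
# and look up which files contain a given preprocessed term. The term "puasa" is
# only an illustrative example; json is assumed to be imported at module level, as
# it is for createDictionary() above.
def _lookup_term_sketch(term, index_path='indexing.txt'):
    with open(index_path, 'r') as json_file:
        index = json.load(json_file)
    return index.get(term, [])

# e.g. _lookup_term_sketch("puasa") -> list of matching file paths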
def preprocess(filename):
    unknown_word = "UNK".lower()
    sent_start = "SENTSTART".lower()
    sent_end = "SENTEND".lower()
    pad_word = "PADWORD".lower()
    special_tokens = [sent_start, sent_end, pad_word, unknown_word]

    pp = PreProcessing()
    inp_src = data_src + filename
    inp_data = open(inp_src, "r").readlines()

    # load vocab
    inputs = pp.preprocess(inp_data)
    word_to_idx = pp.word_to_idx
    idx_to_word = pp.idx_to_word
    word_to_idx_ctr = pp.word_to_idx_ctr
    word_counters = pp.word_counters

    texts = inputs
    for t in texts:
        for token in t:
            if token not in word_to_idx:
                word_to_idx[token] = word_to_idx_ctr
                idx_to_word[word_to_idx_ctr] = token
                word_to_idx_ctr += 1
                word_counters[token] = 0
            word_counters[token] += 1

    # generate sequences
    sequences = []
    for t in texts:
        tmp = [word_to_idx[sent_start]]
        for token in t:
            if token not in word_to_idx:
                tmp.append(word_to_idx[unknown_word])
            else:
                tmp.append(word_to_idx[token])
        tmp.append(word_to_idx[sent_end])
        sequences.append(tmp)
    sequences = pad_sequences(sequences, maxlen=config.max_input_seq_length,
                              padding='pre', truncating='post')

    # get encoder and decoder input
    encoder_inputs = np.array(sequences)
    decoder_outputs = np.array(sequences)

    pp.word_to_idx = word_to_idx
    pp.idx_to_word = idx_to_word
    pp.vocab_size = len(word_to_idx)
    pp.word_to_idx_ctr = word_to_idx_ctr
    pp.word_counters = word_counters
    return pp, encoder_inputs, decoder_outputs
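# Demo (not part of the pipeline) of the padding/truncation settings used in
# preprocess() above; it assumes the same pad_sequences import as the rest of
# this module (typically keras or tensorflow.keras).
def _pad_demo():
    print(pad_sequences([[1, 2, 3]], maxlen=5, padding='pre', truncating='post'))
    # -> [[0 0 1 2 3]]   shorter sequences are left-padded with zeros
    print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5, padding='pre', truncating='post'))
    # -> [[1 2 3 4 5]]   longer sequences are truncated at the end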
def main():
    # params
    params = {}
    params['embeddings_dim'] = config.embeddings_dim
    params['lstm_cell_size'] = config.lstm_cell_size
    params['max_input_seq_length'] = config.max_input_seq_length
    params['max_output_seq_length'] = config.max_output_seq_length - 1  # inputs are all but the last element, outputs are all but the first element
    params['batch_size'] = config.batch_size
    params['pretrained_embeddings'] = config.use_pretrained_embeddings
    params['share_encoder_decoder_embeddings'] = config.share_encoder_decoder_embeddings
    params['use_pointer'] = config.use_pointer
    params['pretrained_embeddings_path'] = config.pretrained_embeddings_path
    params['pretrained_embeddings_are_trainable'] = config.pretrained_embeddings_are_trainable
    params['use_additional_info_from_pretrained_embeddings'] = config.use_additional_info_from_pretrained_embeddings
    params['max_vocab_size'] = config.max_vocab_size
    params['do_vocab_pruning'] = config.do_vocab_pruning
    params['use_reverse_encoder'] = config.use_reverse_encoder
    params['use_sentinel_loss'] = config.use_sentinel_loss
    params['lambd'] = config.lambd
    params['use_context_for_out'] = config.use_context_for_out

    print("PARAMS:")
    for key, value in params.items():
        print(" -- ", key, " = ", value)

    buckets = {0: {'max_input_seq_length': params['max_input_seq_length'],
                   'max_output_seq_length': params['max_output_seq_length']}}

    # mode: preprocessing / train / debug / inference
    mode = sys.argv[1]
    print("mode = ", mode)

    ########### PREPROCESSING
    if mode == "preprocessing":
        print("------------------------------------------------------------------------")
        preprocessing = PreProcessing()
        splits = ["train", "valid", "test"]
        preprocessing.loadVocab('train')
        if params['do_vocab_pruning']:
            preprocessing.pruneVocab(max_vocab_size=params['max_vocab_size'])
        data_seq = {split: preprocessing.loadData(split=split) for split in splits}
        data = {split: preprocessing.prepareMTData(cur_data) for split, cur_data in data_seq.items()}
        pickle.dump(data, open(data_src + "data.obj", mode="wb"))
        pickle.dump(preprocessing, open(data_src + "preprocessing.obj", mode="wb"))
        return
    else:
        data = pickle.load(open(data_src + "data.obj", "rb"))
        preprocessing = pickle.load(open(data_src + "preprocessing.obj", "rb"))
        params['vocab_size'] = preprocessing.vocab_size
        params['preprocessing'] = preprocessing
        train = data['train']
        val = data['valid']
        test = data['test']

    # DEBUG
    if mode == "debug":
        lim = 64
    else:
        # limit training data (intended to be a whole number of batches)
        lim = int(params['batch_size'] * (len(train[0]) / params['batch_size']))
    if lim != -1:
        train_encoder_inputs, train_decoder_inputs, train_decoder_outputs, train_decoder_outputs_matching_inputs = train
        train_encoder_inputs = train_encoder_inputs[:lim]
        train_decoder_inputs = train_decoder_inputs[:lim]
        train_decoder_outputs = train_decoder_outputs[:lim]
        train_decoder_outputs_matching_inputs = train_decoder_outputs_matching_inputs[:lim]
        train = train_encoder_inputs, train_decoder_inputs, train_decoder_outputs, train_decoder_outputs_matching_inputs

    # Pretrained embeddings
    if params['pretrained_embeddings']:
        pretrained_embeddings = pickle.load(open(params['pretrained_embeddings_path'], mode="rb"), encoding='bytes')
        word_to_idx = preprocessing.word_to_idx
        encoder_embedding_matrix = np.random.rand(params['vocab_size'], params['embeddings_dim'])
        decoder_embedding_matrix = np.random.rand(params['vocab_size'], params['embeddings_dim'])
        not_found_count = 0
        for token, idx in word_to_idx.items():
            if token in pretrained_embeddings:
                encoder_embedding_matrix[idx] = pretrained_embeddings[token]
                decoder_embedding_matrix[idx] = pretrained_embeddings[token]
            else:
                if not_found_count < 10:
                    print("No pretrained embedding for (only first 10 such cases will be printed. other prints are suppressed) ", token)
                not_found_count += 1
        params['encoder_embeddings_matrix'] = encoder_embedding_matrix
        params['decoder_embeddings_matrix'] = decoder_embedding_matrix

        if params['use_additional_info_from_pretrained_embeddings']:
            # extend the vocabulary with pretrained tokens not seen in the data
            additional_count = 0
            tmp = []
            for token in pretrained_embeddings:
                if token not in preprocessing.word_to_idx:
                    preprocessing.word_to_idx[token] = preprocessing.word_to_idx_ctr
                    preprocessing.idx_to_word[preprocessing.word_to_idx_ctr] = token
                    preprocessing.word_to_idx_ctr += 1
                    tmp.append(pretrained_embeddings[token])
                    additional_count += 1
            params['vocab_size'] = preprocessing.word_to_idx_ctr
            tmp = np.array(tmp)
            encoder_embedding_matrix = np.vstack([encoder_embedding_matrix, tmp])
            decoder_embedding_matrix = np.vstack([decoder_embedding_matrix, tmp])

    # TRAIN/DEBUG
    if mode == 'train' or mode == "debug":
        if mode == "train":
            training_iters = int(sys.argv[2])
            model_name = sys.argv[3]
        else:
            training_iters = 5
            model_name = "test"
        params['training_iters'] = training_iters
        params['model_name'] = model_name
        train_buckets = {}
        for bucket, _ in enumerate(buckets):
            train_buckets[bucket] = train
        rnn_model = solver.Solver(params, buckets)
        _ = rnn_model.getModel(params, mode='train', reuse=False, buckets=buckets)
        rnn_model.trainModel(config=params, train_feed_dict=train_buckets, val_feed_dct=val,
                             reverse_vocab=preprocessing.idx_to_word, do_init=True)

    # INFERENCE
    elif mode == "inference":
        saved_model_path = sys.argv[2]
        print("saved_model_path = ", saved_model_path)
        inference_type = sys.argv[3]  # greedy / beam
        print("inference_type = ", inference_type)
        params['saved_model_path'] = saved_model_path
        rnn_model = solver.Solver(params, buckets=None, mode='inference')
        _ = rnn_model.getModel(params, mode='inference', reuse=False, buckets=None)
        print("----Running inference-----")

        # val
        val_encoder_inputs, val_decoder_inputs, val_decoder_outputs, val_decoder_outputs_matching_inputs = val
        if len(val_decoder_outputs.shape) == 3:
            val_decoder_outputs = np.reshape(val_decoder_outputs,
                                             (val_decoder_outputs.shape[0], val_decoder_outputs.shape[1]))
        decoder_outputs_inference, decoder_ground_truth_outputs = rnn_model.solveAll(
            params, val_encoder_inputs, val_decoder_outputs, preprocessing.idx_to_word,
            inference_type=inference_type)
        validOutFile_name = saved_model_path + ".valid.output"
        original_data_path = data_src + "valid.original.nltktok"
        BLEUOutputFile_path = saved_model_path + ".valid.BLEU"
        utilities.getBlue(validOutFile_name, original_data_path, BLEUOutputFile_path,
                          decoder_outputs_inference, decoder_ground_truth_outputs, preprocessing)
        print("VALIDATION: ", open(BLEUOutputFile_path, "r").read())

        # test
        test_encoder_inputs, test_decoder_inputs, test_decoder_outputs, test_decoder_outputs_matching_inputs = test
        if len(test_decoder_outputs.shape) == 3:
            test_decoder_outputs = np.reshape(test_decoder_outputs,
                                              (test_decoder_outputs.shape[0], test_decoder_outputs.shape[1]))
        decoder_outputs_inference, decoder_ground_truth_outputs = rnn_model.solveAll(
            params, test_encoder_inputs, test_decoder_outputs, preprocessing.idx_to_word,
            inference_type=inference_type)
        validOutFile_name = saved_model_path + ".test.output"
        original_data_path = data_src + "test.original.nltktok"
        BLEUOutputFile_path = saved_model_path + ".test.BLEU"
        utilities.getBlue(validOutFile_name, original_data_path, BLEUOutputFile_path,
                          decoder_outputs_inference, decoder_ground_truth_outputs, preprocessing)
        print("TEST: ", open(BLEUOutputFile_path, "r").read())

    else:
        print("Please see usage")
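# Assumed command-line usage, inferred from the sys.argv handling in main() above
# (the entry-point file name "main.py" is a placeholder):
#   python main.py preprocessing
#   python main.py train <training_iters> <model_name>
#   python main.py debug
#   python main.py inference <saved_model_path> <greedy|beam>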
def proses_Pencarian(value):
    waktu_awal = time.time()

    waktu_prepro_a = time.time()
    query = preprocessing.PreProcess(value)  # preprocess the query
    waktu_prepro_b = time.time()
    waktu_proses_pre = waktu_prepro_b - waktu_prepro_a
    print("Waktu Proses PrePro query " + str(waktu_proses_pre))

    waktu_bigram_a = time.time()
    value = bigram.proses(query)  # bigram-based spelling correction of the query
    waktu_bigram_b = time.time()
    waktu_proses_bigram = waktu_bigram_b - waktu_bigram_a
    print("Waktu Proses bigram query " + str(waktu_proses_bigram))

    typo = ''
    if value != query:
        typo = value
    value = value.split()  # split the corrected/original query into words

    # Look up each query word in the inverted index
    with open('indexing.json') as f:
        indexing = json.load(f)
    hasil = []
    for i in range(len(value)):
        if value[i] in indexing:
            hasil = hasil + indexing[value[i]]

    # Deduplicate the matching document paths
    doc = []
    for x in range(len(hasil)):
        if hasil[x] not in doc:
            doc.append(hasil[x])

    # Fetch from the database the records whose documents match the query
    data = []
    dokumen = tampil.Tampil_Hadis()
    for k in range(len(doc)):
        for l in range(len(dokumen)):
            init = "Hadits Bukhari-Muslim/" + dokumen[l][2]
            if doc[k] == init:
                data.append(dokumen[l])

    # Compute the Jaccard similarity of each document against the query
    waktu_jaccard_a = time.time()
    rank = []
    folder = "Hadits Bukhari-Muslim/"
    for j in range(len(data)):
        alamat = folder + data[j][2]
        words = jaccard.urut(data[j][3])
        nilai = jaccard.compute_jaccard_similarity_score(value, words)
        rank.append([alamat, nilai])
    waktu_jaccard_b = time.time()
    waktu_proses_jaccard = waktu_jaccard_b - waktu_jaccard_a
    print("Waktu Proses jaccard " + str(waktu_proses_jaccard))

    # Sort documents by similarity score, highest first
    rank = sorted(rank, key=lambda x: x[1], reverse=True)

    waktu_akhir = time.time()
    waktu_proses1 = waktu_akhir - waktu_awal
    return typo, rank, waktu_proses1
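# Hypothetical call site for the search pipeline above; the query string is only an
# example and it assumes the database, the hadith text folder, and the indexing
# file are already in place.
def _search_demo():
    typo, rank, waktu = proses_Pencarian("puasa ramadhan")
    if typo:
        print("Did you mean:", typo)
    for alamat, nilai in rank[:10]:  # ten best-matching documents
        print(nilai, alamat)
    print("Total search time:", waktu)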