def test_process(self):
    """Train a CRF word-segmenter on character sequences and verify it
    predicts the correct space positions for unseen sequences.

    Fix: the original computed r1..r4 via ``autotest.eval_predict_one``
    but never asserted them, so the test could never fail.  Assertions
    added for parity with the sibling NaiveBayes tests in this file.
    """
    # Character sequences with spaces removed; '<s>' marks sequence start.
    train_mat = [
        ['<s>', 'I', 'a', 'm', 'a', 'b', 'o', 'y'],
        ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'i', 'r', 'l'],
        ['<s>', 'I', 'a', 'm', 'a', 'g', 'o', 'o', 'd', 'b', 'o', 'y'],
        ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'o', 'o', 'd',
         'g', 'i', 'r', 'l'],
    ]
    # Per-character boundary labels: 1 == space follows, 0 == no space.
    train_label = [
        [0, 1, 0, 1, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1],
        [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1],
    ]
    nlp_common = nlp()
    voca = nlp_common.extract_vocabulary(train_mat)
    # Encode each character sequence as a one-hot matrix over the vocabulary.
    train_wordseq_mat = [
        nlp_common.set_of_wordseq2matrix(voca, wordseq)
        for wordseq in train_mat
    ]
    crf = CRF(train_wordseq_mat, train_label,
              hidden_state_labeled=True, hidden_state=2)
    crf.fit(toler=0.001, epoch=30)
    # Unseen combinations of the training words ("I am good", etc.).
    ti1 = nlp_common.set_of_wordseq2matrix(
        voca, ['<s>', 'I', 'a', 'm', 'g', 'o', 'o', 'd'])
    r1 = autotest.eval_predict_one(
        crf, ti1, [0, 1, 0, 1, 0, 0, 0, 1], self.logging)
    assert r1 == True
    ti2 = nlp_common.set_of_wordseq2matrix(
        voca, ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'b', 'o', 'y'])
    r2 = autotest.eval_predict_one(
        crf, ti2, [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1], self.logging)
    assert r2 == True
    ti3 = nlp_common.set_of_wordseq2matrix(
        voca, ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'g', 'i', 'r', 'l'])
    r3 = autotest.eval_predict_one(
        crf, ti3, [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1], self.logging)
    assert r3 == True
    ti4 = nlp_common.set_of_wordseq2matrix(
        voca, ['<s>', 'I', 'a', 'm', 'g', 'i', 'r', 'l'])
    r4 = autotest.eval_predict_one(
        crf, ti4, [0, 1, 0, 1, 0, 0, 0, 1], self.logging)
    assert r4 == True
def test_fs_tsv_loader_with_nlp(self):
    """Load the sample e-mail TSV through the nlp-aware loader and log
    the size and a small sample of the extracted vocabulary."""
    sample_words = "sample_data/email/email.tsv"
    self.tlog("loading words => " + sample_words)
    nlp_eng = nlp("eng")
    # 0.1 == fraction of rows held out as the test split.
    (wordmat_train, wordlabel_train, voca,
     wordmat_test, wordlabel_test) = fs.tsv_loader_with_nlp(
        sample_words, 0.1, nlp_eng)
    self.tlog('email data voca size : ' + str(len(voca)))
    self.tlog('voca sample : ' + str(voca[:5]))
def test_word2vector(self):
    """Vectorize text with set-of-words and bag-of-words encodings.

    NOTE(review): reads self.voca, which appears to be populated by
    test_nlp_extract_vocabulary — ordering dependency to confirm.
    """
    nlp_eng = nlp("eng")
    # Set-of-words encoding of a short phrase.
    text_set = "try to do this one"
    sv = nlp_eng.set_of_word2vector(self.voca, text_set)
    self.tlog(sv)
    # Bag-of-words encoding; repeated words should accumulate counts.
    text_bag = "It's your problem. big problem. let's try"
    bv = nlp_eng.bag_of_word2vector(self.voca, text_bag)
    self.tlog(bv)
def test_nlp_extract_vocabulary(self):
    """Extract a vocabulary from three English sentences and check its size.

    Stores the vocabulary on self.voca for use by other tests.
    """
    nlp_eng = nlp("eng")
    docs = [
        "Just try to enjoy it :).",
        "It's very important for me!",
        "What is your problem? you look so bad.",
    ]
    self.voca = nlp_eng.extract_vocabulary(docs)
    self.tlog(self.voca)
    # Expected unique-token count after the tokenizer's filtering.
    assert len(self.voca) == 7
def test_wordseq2matrix(self):
    """Encode character sequences as per-sequence matrices over a shared
    vocabulary and log the result."""
    word_list_array = [
        ['I', 'a', 'm', 'a', 'b', 'o', 'y'],
        ['Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'i', 'r', 'l'],
        ['I', 'a', 'm', 'a', 'g', 'o', 'o', 'd', 'b', 'o', 'y'],
        ['Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'o', 'o', 'd',
         'g', 'i', 'r', 'l'],
    ]
    nlp_common = nlp()
    voca = nlp_common.extract_vocabulary(word_list_array)
    # One matrix per sequence, each row encoding one character.
    word_mat_array = [
        nlp_common.set_of_wordseq2matrix(voca, word_list)
        for word_list in word_list_array
    ]
    self.tlog(word_mat_array)
def test_process(self):
    """End-to-end spam classification: build a vocabulary, vectorize the
    documents, train NaiveBayes, and check two predictions."""
    sample_docs = [
        "hello this is virus mail",
        "hi this is from friend",
        "how about buy this virus",
        "facebook friend contact to you",
        "I love you baby virus",
        "what a nice day how about you",
    ]
    docs_label = ['spam', 'real', 'spam', 'real', 'spam', 'real']
    nlp_eng = nlp("eng")
    # Extract vocabulary from the documents.
    voca = nlp_eng.extract_vocabulary(sample_docs)
    self.tlog(voca)
    assert len(voca) == 12
    # Bag-of-words vector for every document.
    docs_vector = [nlp_eng.bag_of_word2vector(voca, doc) for doc in sample_docs]
    self.tlog(docs_vector)
    # Train the NaiveBayes classifier.
    nbayes = NaiveBayes(docs_vector, docs_label)
    nbayes.fit()
    # Case 1: overlaps heavily with a spam training document.
    tc1 = "this is virus mail"
    tc1_vec = nlp_eng.bag_of_word2vector(voca, tc1)
    self.tlog(tc1)
    self.tlog(tc1_vec)
    r1 = autotest.eval_predict_one(nbayes, tc1_vec, 'spam', self.logging)
    assert r1 == True
    # Case 2: repeated spam-associated words.
    tc2 = "I love you love"
    tc2_vec = nlp_eng.bag_of_word2vector(voca, tc2)
    self.tlog(tc2)
    self.tlog(tc2_vec)
    r2 = autotest.eval_predict_one(nbayes, tc2_vec, 'spam', self.logging)
    assert r2 == True
def test_process(self):
    """Spam/real classification with NaiveBayes over bag-of-words vectors.

    Builds a vocabulary from six tiny documents, trains the classifier,
    and asserts two held-out phrases are flagged as spam.
    """
    sample_docs = [
        "hello this is virus mail",
        "hi this is from friend",
        "how about buy this virus",
        "facebook friend contact to you",
        "I love you baby virus",
        "what a nice day how about you",
    ]
    docs_label = ["spam", "real", "spam", "real", "spam", "real"]
    nlp_eng = nlp("eng")
    # Vocabulary over all documents.
    voca = nlp_eng.extract_vocabulary(sample_docs)
    self.tlog(voca)
    assert len(voca) == 12
    # Vectorize every document against that vocabulary.
    docs_vector = []
    for doc in sample_docs:
        vec = nlp_eng.bag_of_word2vector(voca, doc)
        docs_vector.append(vec)
    self.tlog(docs_vector)
    nbayes = NaiveBayes(docs_vector, docs_label)
    nbayes.fit()
    # First prediction: phrase drawn from spam vocabulary.
    tc1 = "this is virus mail"
    tc1_vec = nlp_eng.bag_of_word2vector(voca, tc1)
    self.tlog(tc1)
    self.tlog(tc1_vec)
    r1 = autotest.eval_predict_one(nbayes, tc1_vec, "spam", self.logging)
    assert r1 == True
    # Second prediction: repeated word should still classify as spam.
    tc2 = "I love you love"
    tc2_vec = nlp_eng.bag_of_word2vector(voca, tc2)
    self.tlog(tc2)
    self.tlog(tc2_vec)
    r2 = autotest.eval_predict_one(nbayes, tc2_vec, "spam", self.logging)
    assert r2 == True
def test_process(self):
    """Train NaiveBayes on the e-mail TSV (30% held out) and require the
    spam-prediction error rate to stay at or below 10%."""
    nlp_eng = nlp("eng")
    email_data_file = "sample_data/email/email.tsv"
    # Loader returns (train matrix, train labels, vocabulary,
    # test matrix, test labels); 0.3 is the test-split fraction.
    loaded = fs.tsv_loader_with_nlp(email_data_file, 0.3, nlp_eng)
    emailmat_train, emaillabel_train, voca, emailmat_test, emaillabel_test = loaded
    self.tlog(voca)
    email_nbayes = NaiveBayes(emailmat_train, emaillabel_train)
    email_nbayes.fit()
    error_rate = autotest.eval_predict(
        email_nbayes, emailmat_test, emaillabel_test, self.logging)
    self.tlog("spam-mail predict (with NaiveBayes) error rate : " + str(error_rate))
    assert error_rate <= 0.1
def test_process(self):
    """Same spam-mail pipeline but with lowercasing tokenization
    ("eng_lower") and a larger 40% test split; error must be <= 10%."""
    nlp_eng = nlp("eng_lower")
    email_data_file = "sample_data/email/email.tsv"
    loaded = fs.tsv_loader_with_nlp(email_data_file, 0.4, nlp_eng)
    emailmat_train, emaillabel_train, voca, emailmat_test, emaillabel_test = loaded
    self.tlog(voca)
    # Fit on the training split, score on the held-out split.
    email_nbayes = NaiveBayes(emailmat_train, emaillabel_train)
    email_nbayes.fit()
    error_rate = autotest.eval_predict(
        email_nbayes, emailmat_test, emaillabel_test, self.logging)
    self.tlog("spam-mail predict (with NaiveBayes) error rate : " + str(error_rate))
    assert error_rate <= 0.1
def test_nlp_split(self):
    """Check word- and sentence-splitting of the English nlp tokenizer."""
    nlp_eng = nlp("eng")
    sentence = "hello this is virus mail"
    text = "one sentence\ntwo sentence\nthree sentence"
    # Word split on a single sentence and on multi-line text.
    sent_words = nlp_eng.split2words(sentence)
    text_words = nlp_eng.split2words(text)
    # Sentence split on the multi-line text.
    sentences = nlp_eng.split2sentence(text)
    self.tlog(sent_words)
    self.tlog(sentences)
    # NOTE(review): 3 words from a 5-word sentence implies the tokenizer
    # filters some tokens (e.g. stopwords) — confirm against nlp impl.
    assert sent_words[2] == 'mail'
    assert len(sent_words) == 3
    assert sentences[1] == "two sentence"
    assert len(sentences) == 3
    assert text_words[2] == "two"
    assert len(text_words) == 6