Example #1
    def test_process(self):

        train_mat = [
            # Character sequences with the spaces removed
            ['<s>', 'I', 'a', 'm', 'a', 'b', 'o', 'y'],
            ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'i', 'r', 'l'],
            ['<s>', 'I', 'a', 'm', 'a', 'g', 'o', 'o', 'd', 'b', 'o', 'y'],
            ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'o', 'o', 'd', 'g', 'i', 'r', 'l'],
        ]

        train_label = [
            # One label per character:
            # 1 == a space (or the sentence end) follows this character,
            # 0 == no space
            [0, 1, 0, 1, 1, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1],
            [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1],
        ]

        nlp_common = nlp()
        voca = nlp_common.extract_vocabulary(train_mat)
        train_wordseq_mat = []
        for wordseq in train_mat:
            wordseq_mat = nlp_common.set_of_wordseq2matrix(voca, wordseq)
            train_wordseq_mat.append(wordseq_mat)

        crf = CRF(train_wordseq_mat,
                  train_label,
                  hidden_state_labeled=True,
                  hidden_state=2)
        crf.fit(toler=0.001, epoch=30)

        ti1 = nlp_common.set_of_wordseq2matrix(
            voca, ['<s>', 'I', 'a', 'm', 'g', 'o', 'o', 'd'])
        r1 = autotest.eval_predict_one(crf, ti1, [0, 1, 0, 1, 0, 0, 0, 1],
                                       self.logging)

        ti2 = nlp_common.set_of_wordseq2matrix(
            voca, ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'a', 'b', 'o', 'y'])
        r2 = autotest.eval_predict_one(crf, ti2,
                                       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1],
                                       self.logging)

        ti3 = nlp_common.set_of_wordseq2matrix(
            voca, ['<s>', 'Y', 'o', 'u', 'a', 'r', 'e', 'g', 'i', 'r', 'l'])
        r3 = autotest.eval_predict_one(crf, ti3,
                                       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
                                       self.logging)

        ti4 = nlp_common.set_of_wordseq2matrix(
            voca, ['<s>', 'I', 'a', 'm', 'g', 'i', 'r', 'l'])
        r4 = autotest.eval_predict_one(crf, ti4, [0, 1, 0, 1, 0, 0, 0, 1],
                                       self.logging)
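This test only makes sense if `extract_vocabulary` collects the distinct characters of the training sequences and `set_of_wordseq2matrix` encodes a sequence as one one-hot row per character, which the two-state CRF then labels. Below is a minimal sketch of that assumed representation; `build_vocabulary` and `sequence_to_one_hot` are hypothetical stand-ins, not the library's actual functions.

    # Sketch of the assumed input representation (hypothetical helpers).
    def build_vocabulary(sequences):
        # Collect each distinct symbol once, in first-seen order.
        vocabulary = []
        for sequence in sequences:
            for symbol in sequence:
                if symbol not in vocabulary:
                    vocabulary.append(symbol)
        return vocabulary

    def sequence_to_one_hot(vocabulary, sequence):
        # One row per symbol: 1 in that symbol's column, 0 elsewhere.
        matrix = []
        for symbol in sequence:
            row = [0] * len(vocabulary)
            if symbol in vocabulary:
                row[vocabulary.index(symbol)] = 1
            matrix.append(row)
        return matrix

    voca = build_vocabulary([['<s>', 'I', 'a', 'm', 'a', 'b', 'o', 'y']])
    print(sequence_to_one_hot(voca, ['<s>', 'b', 'o', 'y']))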
Example #2
    def test_fs_tsv_loader_with_nlp(self):
        sample_words = "sample_data/email/email.tsv"
        self.tlog("loading words => " + sample_words)

        nlp_eng = nlp("eng")
        wordmat_train, wordlabel_train, voca, wordmat_test, wordlabel_test \
          = fs.tsv_loader_with_nlp(sample_words, 0.1, nlp_eng)

        self.tlog('email data voca size : ' + str(len(voca)))
        self.tlog('voca sample : ' + str(voca[:5]))
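Here `fs.tsv_loader_with_nlp` is assumed to read a tab-separated file of labelled texts, vectorize them with the given nlp object, and hold out the requested fraction (0.1) as a test set. A rough sketch of just the loading-and-splitting part follows; `load_tsv_split`, the column order, and the return layout are guesses, not the library's actual API.

    import csv

    def load_tsv_split(path, test_ratio):
        # Assumed format: one "label<TAB>text" record per line; the last
        # test_ratio share of the rows is held out as the test split.
        labels, texts = [], []
        with open(path, newline="") as handle:
            for row in csv.reader(handle, delimiter="\t"):
                if len(row) >= 2:
                    labels.append(row[0])
                    texts.append(row[1])
        cut = int(len(texts) * (1 - test_ratio))
        return texts[:cut], labels[:cut], texts[cut:], labels[cut:]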
Example #3
    def test_word2vector(self):
        nlp_eng = nlp("eng")

        input_txt = "try to do this one"
        set_vector = nlp_eng.set_of_word2vector(self.voca, input_txt)
        self.tlog(set_vector)
        
        input_txt2 = "It's your problem. big problem. let's try"
        bag_vector = nlp_eng.bag_of_word2vector(self.voca, input_txt2)
        self.tlog(bag_vector)
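The two calls above presumably differ in the usual set-of-words versus bag-of-words way: one marks whether a vocabulary term occurs at all, the other counts how often it occurs. Illustrative stand-ins (not the library's `set_of_word2vector` / `bag_of_word2vector`):

    def set_of_words_vector(vocabulary, words):
        # 1 if the vocabulary term occurs in the word list, else 0
        return [1 if term in words else 0 for term in vocabulary]

    def bag_of_words_vector(vocabulary, words):
        # occurrence count of each vocabulary term
        return [words.count(term) for term in vocabulary]

    voca = ["try", "problem", "big", "virus"]
    words = "big problem big big problem try".split()
    print(set_of_words_vector(voca, words))  # [1, 1, 1, 0]
    print(bag_of_words_vector(voca, words))  # [1, 2, 3, 0]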
Example #4
    def test_nlp_extract_vocabulary(self):

        nlp_eng = nlp("eng")
        docs = [
            "Just try to enjoy it :).",
            "It's very important for me!",
            "What is your problem? you look so bad.",
        ]
        self.voca = nlp_eng.extract_vocabulary(docs)
        self.tlog(self.voca)
        assert len(self.voca) == 7
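The assertion `len(self.voca) == 7` implies that `extract_vocabulary` tokenizes the documents, drops punctuation and common stopwords, and keeps each remaining term once. A rough illustration of that kind of pipeline; the token pattern and stopword list below are guesses, so its output need not match the asserted count of 7.

    import re

    GUESSED_STOPWORDS = {"it", "is", "to", "for", "me", "so", "you", "your"}

    def extract_vocabulary_sketch(documents):
        # Lowercase, pull out word-like tokens, filter the guessed stopwords,
        # and keep each surviving token once in first-seen order.
        vocabulary = []
        for document in documents:
            for token in re.findall(r"[a-z']+", document.lower()):
                if token not in GUESSED_STOPWORDS and token not in vocabulary:
                    vocabulary.append(token)
        return vocabulary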
Example #5
    def test_wordseq2matrix(self):
        word_list_array = [
            ['I', 'a', 'm', 'a', 'b', 'o', 'y'],
            ['Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'i', 'r', 'l'],
            ['I', 'a', 'm', 'a', 'g', 'o', 'o', 'd', 'b', 'o', 'y'],
            ['Y', 'o', 'u', 'a', 'r', 'e', 'a', 'g', 'o', 'o', 'd', 'g', 'i', 'r', 'l'],
        ]
        nlp_common = nlp()
        voca = nlp_common.extract_vocabulary(word_list_array)
        word_mat_array = []
        for word_list in word_list_array:
            word_mat = nlp_common.set_of_wordseq2matrix(voca, word_list)
            word_mat_array.append(word_mat)
        self.tlog(word_mat_array)
Example #6
    def test_process(self):
        sample_docs = [
            "hello this is virus mail",
            "hi this is from friend",
            "how about buy this virus",
            "facebook friend contact to you",
            "I love you baby virus",
            "what a nice day how about you",
        ]

        docs_label = ['spam', 'real', 'spam', 'real', 'spam', 'real']

        nlp_eng = nlp("eng")

        # extract vocabulary from docs
        voca = nlp_eng.extract_vocabulary(sample_docs)
        self.tlog(voca)
        assert len(voca) == 12

        # convert docs to bag of word vector using vocabulary
        docs_vector = []
        for doc in sample_docs:
            docs_vector.append(nlp_eng.bag_of_word2vector(voca, doc))
        self.tlog(docs_vector)

        # training NaiveBayes
        nbayes = NaiveBayes(docs_vector, docs_label)
        nbayes.fit()

        # test case 1
        tc1 = "this is virus mail"
        tc1_vec = nlp_eng.bag_of_word2vector(voca, tc1)

        self.tlog(tc1)
        self.tlog(tc1_vec)

        r1 = autotest.eval_predict_one(nbayes, tc1_vec, 'spam', self.logging)
        assert r1

        # test case 2
        tc2 = "I love you love"
        tc2_vec = nlp_eng.bag_of_word2vector(voca, tc2)

        self.tlog(tc2)
        self.tlog(tc2_vec)

        r2 = autotest.eval_predict_one(nbayes, tc2_vec, 'spam', self.logging)
        assert r2
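The `NaiveBayes` class above is handed bag-of-word count vectors plus string labels, and is assumed to learn per-class word likelihoods with smoothing. A from-scratch multinomial Naive Bayes sketch of that idea; `TinyMultinomialNB` is illustrative only, not the library's implementation.

    import math

    class TinyMultinomialNB:
        # Illustrative multinomial Naive Bayes with Laplace smoothing.

        def fit(self, vectors, labels):
            self.classes = sorted(set(labels))
            self.log_prior = {}
            self.log_likelihood = {}
            n_features = len(vectors[0])
            for cls in self.classes:
                rows = [v for v, lab in zip(vectors, labels) if lab == cls]
                self.log_prior[cls] = math.log(len(rows) / len(vectors))
                # Total count of each vocabulary term within this class.
                totals = [sum(col) for col in zip(*rows)]
                denom = sum(totals) + n_features  # add-one smoothing
                self.log_likelihood[cls] = [
                    math.log((t + 1) / denom) for t in totals]
            return self

        def predict(self, vector):
            def score(cls):
                return self.log_prior[cls] + sum(
                    c * ll for c, ll in zip(vector, self.log_likelihood[cls]))
            return max(self.classes, key=score)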
Example #7
    def test_process(self):
        sample_docs = [
            "hello this is virus mail",
            "hi this is from friend",
            "how about buy this virus",
            "facebook friend contact to you",
            "I love you baby virus",
            "what a nice day how about you",
        ]

        docs_label = ["spam", "real", "spam", "real", "spam", "real"]

        nlp_eng = nlp("eng")

        # extract vocabulary from docs
        voca = nlp_eng.extract_vocabulary(sample_docs)
        self.tlog(voca)
        assert len(voca) == 12

        # convert docs to bag of word vector using vocabulary
        docs_vector = []
        for doc in sample_docs:
            docs_vector.append(nlp_eng.bag_of_word2vector(voca, doc))
        self.tlog(docs_vector)

        # training NaiveBayes
        nbayes = NaiveBayes(docs_vector, docs_label)
        nbayes.fit()

        # test case 1
        tc1 = "this is virus mail"
        tc1_vec = nlp_eng.bag_of_word2vector(voca, tc1)

        self.tlog(tc1)
        self.tlog(tc1_vec)

        r1 = autotest.eval_predict_one(nbayes, tc1_vec, "spam", self.logging)
        assert r1

        # test case 2
        tc2 = "I love you love"
        tc2_vec = nlp_eng.bag_of_word2vector(voca, tc2)

        self.tlog(tc2)
        self.tlog(tc2_vec)

        r2 = autotest.eval_predict_one(nbayes, tc2_vec, "spam", self.logging)
        assert r2
Example #8
    def test_process(self):

        nlp_eng = nlp("eng")

        email_data_file = "sample_data/email/email.tsv"
        emailmat_train, emaillabel_train, voca, emailmat_test, emaillabel_test = fs.tsv_loader_with_nlp(
            email_data_file, 0.3, nlp_eng
        )
        self.tlog(voca)

        email_nbayes = NaiveBayes(emailmat_train, emaillabel_train)
        email_nbayes.fit()

        error_rate = autotest.eval_predict(email_nbayes, emailmat_test, emaillabel_test, self.logging)
        self.tlog("spam-mail predict (with NaiveBayes) error rate : " + str(error_rate))

        assert error_rate <= 0.1
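`autotest.eval_predict` is used here as an error-rate check over the held-out split. A minimal sketch of that kind of evaluation, assuming the classifier exposes a `predict`-style call; `evaluate_error_rate` is a hypothetical helper, not the autotest API.

    def evaluate_error_rate(model, test_vectors, test_labels):
        # Fraction of held-out examples whose prediction disagrees with
        # the expected label.
        errors = 0
        for vector, expected in zip(test_vectors, test_labels):
            if model.predict(vector) != expected:
                errors += 1
        return errors / len(test_labels)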
Example #9
    def test_process(self):

        nlp_eng = nlp("eng_lower")

        email_data_file = "sample_data/email/email.tsv"
        emailmat_train, emaillabel_train, voca, emailmat_test, emaillabel_test \
                = fs.tsv_loader_with_nlp(email_data_file, 0.4, nlp_eng)
        self.tlog(voca)

        email_nbayes = NaiveBayes(emailmat_train, emaillabel_train)
        email_nbayes.fit()

        error_rate = autotest.eval_predict(email_nbayes, emailmat_test,
                                           emaillabel_test, self.logging)
        self.tlog("spam-mail predict (with NaiveBayes) error rate : " +
                  str(error_rate))

        assert error_rate <= 0.1
Example #10
    def test_nlp_split(self):

        nlp_eng = nlp("eng")

        sentence = "hello this is virus mail"
        text = "one sentence\ntwo sentence\nthree sentence"

        words = nlp_eng.split2words(sentence)
        words_text = nlp_eng.split2words(text)
        split_sentence = nlp_eng.split2sentence(text)

        self.tlog(words)
        self.tlog(split_sentence)

        assert words[2] == 'mail'
        assert len(words) == 3
        assert split_sentence[1] == "two sentence"
        assert len(split_sentence) == 3
        assert words_text[2] == "two"
        assert len(words_text) == 6
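The assertions above imply that `split2words` drops a few stopwords ('this', 'is') while keeping word order, and that `split2sentence` simply splits on line breaks. Hypothetical stand-ins that satisfy those particular assertions (the real stopword list and splitting rules are unknown):

    GUESSED_STOPWORDS = {"this", "is", "a", "the"}

    def split_to_words(text):
        # Whitespace tokens with the guessed stopwords removed.
        return [w for w in text.split() if w.lower() not in GUESSED_STOPWORDS]

    def split_to_sentences(text):
        # Treat each non-empty line as one sentence.
        return [line for line in text.splitlines() if line.strip()]

    print(split_to_words("hello this is virus mail"))        # ['hello', 'virus', 'mail']
    print(split_to_sentences("one sentence\ntwo sentence"))  # ['one sentence', 'two sentence']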