Example #1
import os

import pandas as pd

# SortedWords, Preprocessor and QuickSort are helper classes defined
# elsewhere in this project (a binary-searchable word list, text cleanup,
# and a sorting utility); import them from wherever the project keeps them.


class StatisticTokenize:

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        # Seed the tag model from the dictionary file: one entry per word,
        # with its grammatical role and an initial rate of 1.
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')

        self.tag_model = {'word': [],
                          'role': [],
                          'rate': []}
        for line in lines_file:
            if len(line) == 2:
                # Two-character lines carry an extra role for the previous word.
                self.tag_model['role'][-1] += '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        # Load a previously saved tag model back into plain Python lists.
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {'word': pd_file['word'].tolist(),
                          'role': pd_file['role'].tolist(),
                          'rate': pd_file['rate'].tolist()}

    def sort_model(self):
        # Sort the model by word so statistic_doc can binary-search it.
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(self.tag_model, ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        fo = open(self.current_dir + '/../resources/VNESEcorpus.txt', 'r')
        self.text_train = fo.read()
        self.text_train = self.text_train.split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        # Count occurrences of every known 1- to 3-word phrase in the corpus.
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        text = text.split(' ')
        # Slide windows of one, two and three words over the sentence.
        for len_word in range(1, 4):
            for i in range(len(text) - len_word + 1):
                word = ' '.join(text[i:i + len_word])
                position = self.sorted_list.find(word)

                # find() returns a position; only count it on an exact match.
                if self.sorted_list.words[position] == word:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        # Convert the model to a DataFrame and persist it for read_model.
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(self.current_dir + '/../resources/tag_model.csv', index=False)
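Taken together, these methods form a small training pipeline. A minimal driver sketch; the call order below is inferred from the methods themselves, not taken from the project:

tokenizer = StatisticTokenize()
tokenizer.create_model()     # seed counts from vi_dict.txt
tokenizer.sort_model()       # sort so lookups can binary-search
tokenizer.read_text_train()  # load VNESEcorpus.txt
tokenizer.statistic()        # count 1- to 3-word phrases
tokenizer.save_model()       # write tag_model.csv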
Example #2
import math
import os

import pandas as pd

# SortedWords and Preprocessor are project-local helpers, as above.


class DocToVector:

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()
        self.preprocess = Preprocessor()

    def get_dictionary(self):
        # Load the known vocabulary into the sorted word list.
        pd_file = pd.read_csv(self.current_dir + '/../resources/word_frequency.csv')
        self.sorted_words.set(pd_file['word'])

    def add_to_dict_counter(self, docs):
        # Tokenize on spaces; the counting step is not implemented here.
        docs = self.preprocess.split_space(docs)

    def tf_idf(self, frequency_sentence, frequency_docs, num_words):
        # Log-scaled term frequency times a log document-frequency ratio.
        return (1 + math.log(frequency_sentence)) * math.log(frequency_docs * 1.0 / num_words)
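A quick call sketch for tf_idf; the argument values are illustrative only, and the parameter semantics are assumed from their names:

vec = DocToVector()
w = vec.tf_idf(frequency_sentence=3, frequency_docs=500, num_words=100)
# (1 + ln 3) * ln(500 / 100) ≈ 2.099 * 1.609 ≈ 3.38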
Example #3
import hashlib
import os

# Preprocessor is a project-local helper, as above.


class NGramSimilarity:

    def __init__(self):
        self.preprocessor = Preprocessor()
        self.current_dir = os.path.dirname(__file__)

    def check(self, doc1, doc2):
        # Overlap coefficient: shared 3-gram hashes divided by the size of
        # the smaller set. Assumes both documents have at least three words.
        checksum1 = self.three_grams(self.preprocessor.remove_mark(doc1))
        checksum2 = self.three_grams(self.preprocessor.remove_mark(doc2))

        return len(checksum1 & checksum2) * 1.0 / min(len(checksum1), len(checksum2))

    def three_grams(self, doc):
        # Hash every run of three consecutive words into a set of digests.
        checksums = set()

        doc = doc.split(' ')
        for i in range(len(doc) - 2):
            checksums.add(hashlib.md5(' '.join(doc[i:i + 3]).encode('utf-8')).digest())

        return checksums
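A usage sketch; the two strings are placeholders, and each needs at least three words:

sim = NGramSimilarity()
score = sim.check('the quick brown fox jumps', 'the quick brown dog sleeps')
# Assuming remove_mark leaves plain words unchanged, only 'the quick brown'
# is shared here, so score == 1/3.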
Example #4
import os

# SortedWords and Preprocessor are project-local helpers, as above.


class NaiveBayesClassify:
    '''
    Classify texts with multinomial Naive Bayes and add-one (Laplace) smoothing.
    '''

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.preprocessor = Preprocessor()

    def set_doc_c1(self):
        # Class 1 training sentences, one per line.
        with open(self.current_dir + '/../resources/pos_text.txt', 'r') as fo:
            self.text_c1 = fo.read().split('\n')
        self.text_c1 = self.preprocessor.remove_mark_docs(self.text_c1)

    def set_doc_c2(self):
        # Class 2 training sentences, one per line.
        with open(self.current_dir + '/../resources/neg_text.txt', 'r') as fo:
            self.text_c2 = fo.read().split('\n')
        self.text_c2 = self.preprocessor.remove_mark_docs(self.text_c2)

    def set_doc_classify(self):
        # Sentences to classify, one per line.
        with open(self.current_dir + '/../resources/classify_text.txt', 'r') as fo:
            self.text_classify = fo.read().split('\n')
        self.text_classify = self.preprocessor.remove_mark_docs(self.text_classify)

    def probability_cal(self):
        # Class priors from the relative sizes of the two training sets.
        self.p_c1 = len(self.text_c1) * 1.0 / (len(self.text_c1) + len(self.text_c2))
        self.p_c2 = 1 - self.p_c1

        # Build the vocabulary from both classes.
        for sentence in self.text_c1:
            for w in sentence.split(' '):
                self.sorted_words.add(word=w)

        for sentence in self.text_c2:
            for w in sentence.split(' '):
                self.sorted_words.add(word=w)

        # Per-class term frequencies over that vocabulary.
        self.tf_c1 = [0 for i in range(len(self.sorted_words.words))]
        for sentence in self.text_c1:
            for w in sentence.split(' '):
                self.tf_c1[self.sorted_words.find(w)] += 1

        self.tf_c2 = [0 for i in range(len(self.sorted_words.words))]
        for sentence in self.text_c2:
            for w in sentence.split(' '):
                self.tf_c2[self.sorted_words.find(w)] += 1

    def train(self):
        # Total token counts per class.
        sum_tf_c1 = sum(self.tf_c1)
        sum_tf_c2 = sum(self.tf_c2)

        # Add-one smoothed word likelihoods per class.
        self.probability_c1 = []
        self.probability_c2 = []
        for i in range(len(self.sorted_words.words)):
            self.probability_c1.append((self.tf_c1[i] + 1.0) / (len(self.sorted_words.words) + sum_tf_c1))
            self.probability_c2.append((self.tf_c2[i] + 1.0) / (len(self.sorted_words.words) + sum_tf_c2))

    def get_rate_probability_c1(self, word):
        # Likelihood ratio P(word|c1)/P(word|c2); unknown words are neutral.
        position = self.sorted_words.find(word)
        if word == self.sorted_words.words[position]:
            return self.probability_c1[position] / self.probability_c2[position]
        else:
            return 1

    def test(self):
        for sentence in self.text_classify:
            self.test_doc(sentence)

    def test_doc(self, text):
        # Decide c1 when P(x|c1)/P(x|c2) * P(c1) >= P(c2).
        rate_px_c1 = 1

        for w in text.split(' '):
            rate_px_c1 *= self.get_rate_probability_c1(w)

        if rate_px_c1 * self.p_c1 >= self.p_c2:
            print(text + ': belongs to c1')
        else:
            print(text + ': belongs to c2')
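An end-to-end sketch; the call order is inferred from the methods themselves, not taken from the project:

clf = NaiveBayesClassify()
clf.set_doc_c1()        # load class 1 training sentences
clf.set_doc_c2()        # load class 2 training sentences
clf.set_doc_classify()  # load sentences to label
clf.probability_cal()   # priors, vocabulary, term frequencies
clf.train()             # smoothed per-word likelihoods
clf.test()              # print a c1/c2 decision per sentence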
Example #5
    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.preprocessor = Preprocessor()
Example #6
    def __init__(self):
        self.preprocessor = Preprocessor()
        self.current_dir = os.path.dirname(__file__)
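Every example above leans on SortedWords, which is not shown on this page. Below is a minimal stand-in consistent with how it is called (set, add, find, and a words attribute); the project's real class may differ:

import bisect


class SortedWords:

    def __init__(self):
        self.words = []

    def set(self, words):
        # Keep the list sorted so find() can binary-search it.
        self.words = sorted(words)

    def add(self, word):
        # Insert a word at its sorted position if not already present.
        pos = bisect.bisect_left(self.words, word)
        if pos == len(self.words) or self.words[pos] != word:
            self.words.insert(pos, word)

    def find(self, word):
        # Index of the word if present, else its insertion point; callers
        # compare self.words[position] against the word to check for a hit.
        return bisect.bisect_left(self.words, word)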