예제 #1
0
파일: document.py 프로젝트: TechBK/NLP-new
class Document(object):
    """A text document reduced to the words it contains.

    Words are collected in a ``BagOfWords`` instance (defined elsewhere in
    the project); this class only reads files, tokenizes them and forwards
    each word to that bag.
    """

    def __init__(self):
        # Receives every word found by read_document() via add_word().
        self._words_and_freq = BagOfWords()

    def read_document(self, filename):
        """Read a text file and add each of its words to the bag.

        The file is decoded as UTF-8; if that fails with a
        ``UnicodeDecodeError`` it is read again as Latin-1. The text is
        lowercased and split on runs of non-word characters, and every
        non-empty piece is passed to ``self._words_and_freq.add_word``.

        Args:
            filename: Path of the text file to read.

        Raises:
            OSError: If the file cannot be opened.
        """
        # ``with`` closes the handle even when decoding fails part-way, so
        # the Latin-1 retry never leaves the first handle open.
        try:
            with open(filename, "r", encoding="utf-8") as handle:
                text = handle.read()
        except UnicodeDecodeError:
            with open(filename, "r", encoding="latin-1") as handle:
                text = handle.read()

        # The separator must be "+" (one or more), not "*": a pattern that
        # can match the empty string makes re.split() cut between every
        # single character on Python 3.7+, yielding letters instead of words.
        pieces = re.split(r"[^\wäöüÄÖÜß]+", text.lower())

        # Leading/trailing separators produce empty strings; skip them.
        for word in pieces:
            if word:
                self._words_and_freq.add_word(word)

    def __str__(self):
        """Return the string form of the underlying bag of words."""
        return str(self._words_and_freq)

    def wordAndFreq(self):
        """Return whatever the bag's ``bagofwords()`` method returns.

        NOTE(review): presumably a word -> frequency mapping; confirm
        against the ``BagOfWords`` implementation.
        """
        return self._words_and_freq.bagofwords()