Example #1
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Implementation of method that opens file, tokenizes it and adds it to
        the corpus.
        """
        #print _file, _class
        in_file = open(_file, 'r')
        text = in_file.read()
        in_file.close()        
        text = text.split()        
        tokens = []        
        tok = Tokenizer()
              
        ############## Stopword removal ##########################
        stopwordFile = "stopwords" + self._lang
        f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus", stopwordFile), 'rt')
        stopwords = f.read()
        f.close()
        stopwords = stopwords.split()
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################
        
        for word in text:
            tokens.extend(tok.fineTokenization(word)) 
        token_dict = {}        
        for token in tokens:
            try:
                token_dict[token.lower()] += 1
            except KeyError:
                token_dict[token.lower()] = 1
        #document is a CorpusDocument object. The docid is the path to the file
        #(_file).
        document = CorpusDocument(_file)
        for key, value in token_dict.iteritems():
            if not (rem_stopwords and key in stopwords):
                if stemmer is not None:
                    key = stemmer.stem(key)
                if key is not None:
                    document[self._add_word_to_lex(key)] = value
                
        if self.insert_document(document, _class):
            self._file_number += 1
        else:
            self._repeated_files += 1
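The method above boils down to a single pipeline: split the text into tokens, lowercase and count them, drop stopwords, stem the survivors, and store the counts on a CorpusDocument. The sketch below is a simplified, standalone version of that pipeline for illustration only; it uses a plain dict instead of the project's Tokenizer and CorpusDocument classes and takes the stopword list and stemming function as ordinary arguments (an assumption, not the project's API). Note that it accumulates counts when two surface forms stem to the same key, whereas the code above overwrites the earlier count.

    def term_frequencies(text, stopwords=(), stem=None):
        """Simplified sketch: whitespace tokenization, lowercasing, counting,
        stopword removal and optional stemming."""
        counts = {}
        for token in text.split():
            token = token.lower()
            counts[token] = counts.get(token, 0) + 1
        filtered = {}
        for key, value in counts.items():
            if key in stopwords:
                continue
            if stem is not None:
                key = stem(key)
            if key is not None:
                # accumulate in case two surface forms stem to the same key
                filtered[key] = filtered.get(key, 0) + value
        return filtered

    # Example: term_frequencies("the cat sat on the mat", stopwords={"the", "on"})
    # -> {'cat': 1, 'sat': 1, 'mat': 1}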
Example #2
    def load(self, _file, rem_stopwords=True, stem=True, merge=True,
             class_name=None):
        """
        Abstract method implementation for the txt format
        """
        in_file = open(_file, 'r')
        text = in_file.read()
        in_file.close()     
        text = text.split()        
        tokens = []        
        tok = Tokenizer()
        
        ############## Stopword removal ##########################
        stopwordFile = "stopwords" + self._lang
        f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus", stopwordFile), 'rt')
        stopwords = f.read()
        f.close()
        stopwords = stopwords.split()
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################
        
        for word in text:
            tokens.extend(tok.fineTokenization(word))            
        token_dict = {}        
        for token in tokens:
            try:
                token_dict[token.lower()] += 1
            except KeyError:
                token_dict[token.lower()] = 1    
        document = CorpusDocument(_file)
        for key, value in token_dict.iteritems():
            if not (rem_stopwords and key in stopwords):
                if stemmer is not None:
                    key = stemmer.stem(key)
                if key is not None:
                    document[self._add_word_to_lex(key)] = value
                    
        if not merge:
            self.clear()
        self.insert_document(document, class_name)
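The merge flag only decides whether the existing corpus contents are cleared before the new document is inserted. A hypothetical usage sketch follows; the TxtCorpus class name and its constructor are assumed names for illustration, and only load() and its keyword arguments come from the code above.

    # Hypothetical driver code -- "TxtCorpus" is an assumed name.
    corpus = TxtCorpus()

    # merge=True (the default) keeps whatever is already loaded and adds to it.
    corpus.load('reviews/r1.txt', class_name='positive')
    corpus.load('reviews/r2.txt', class_name='negative')

    # merge=False clears the corpus first, so only the new document remains.
    corpus.load('reviews/r3.txt', class_name='positive', merge=False)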
Example #3
 def test_tokenize(self):
     """
     Tests the tokenization
     """
     _file = codecs.open(os.path.join(os.environ["TCLASS"], "tests", \
                    "corpora", "corpus1/", "economy", "e1.txt"), 'r', "utf8")
     text = _file.read()
     text = text.split()
     tokens = []
     tok = Tokenizer()     
     for word in text:
         tokens.extend(tok.fineTokenization(word))
     saida = eval(codecs.open(os.path.join(os.environ["TCLASS"], "tests", \
                        "tclass", "tokenizer", "saida"), 'r', "utf8").read())
                        
     print saida, tokens
                        
     assert saida == tokens
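The saida fixture ("saída" is Portuguese for "output") holds a Python literal that the test eval()s back into a token list. A hedged sketch of how such a fixture could be regenerated from a known-good token list is below; the write step is an assumption, and only the fixture path comes from the test above.

    import codecs
    import os

    # Hypothetical regeneration of the fixture: serialize a known-good token
    # list as a Python literal, matching what the test eval()s back in.
    tokens = [u'placeholder', u'token', u'list']  # output of a known-good run
    out_path = os.path.join(os.environ["TCLASS"], "tests", "tclass",
                            "tokenizer", "saida")
    fixture = codecs.open(out_path, 'w', 'utf8')
    fixture.write(repr(tokens))
    fixture.close()

If the fixture is trusted, eval() is workable; ast.literal_eval would parse the same literal without executing arbitrary code.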
Example #4
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Implementation of method that opens file, tokenizes it and adds it to
        the corpus.
        
        @param _file: file to be loaded
        @param _class: class of the file
        """
        #initialization
        dom = parse(_file)
        filhos = dom.childNodes[0].childNodes
        
        ############## Stopword removal ##########################
        stopwordFile = "stopwords" + self._lang
        f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus", stopwordFile), 'rt')
        stopwords = f.read()
        f.close()
        stopwords = stopwords.split()
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################
        
        body_data = ""
        i = 3
        while i < len(filhos) - 1:
            body_data = filhos[i].getElementsByTagName("Resenha")[0].childNodes[0].data
            text = body_data.split()        
            tokens = []
            tok = Tokenizer()        
            for word in text:
                tokens.extend(tok.fineTokenization(word))            
            token_dict = {}
            for token in tokens:
                try:
                    token_dict[token.lower()] += 1
                except KeyError:
                    token_dict[token.lower()] = 1
            #document is a CorpusDocument object. The docid is the path to the file
            #(_file).
            document = CorpusDocument(filhos[i].getAttribute('id'))
            
            for key, value in token_dict.iteritems():
                if not (rem_stopwords and key in stopwords):
                    if stemmer is not None:
                        key = stemmer.stem(key)
                    if key is not None:
                        document[self._add_word_to_lex(key)] = value
                    
            if self.insert_document(document, _class):
                self._file_number += 1
            else:
                self._repeated_files += 1
            body_data = ""
            i += 2
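The loop starts at child index 3 and steps by 2, which matches how xml.dom.minidom represents a pretty-printed file: whitespace text nodes alternate with element nodes, so the first element child (index 1) is skipped and the review elements appear at indices 3, 5, and so on. The sketch below shows a minimal input consistent with that layout; only the Resenha tag and the id attribute come from the code above, while the corpus, header and doc element names are placeholders.

    from xml.dom.minidom import parseString

    # Hypothetical document shaped the way the loop above expects.
    sample = """<corpus>
      <header>metadata the loader skips</header>
      <doc id="d1"><Resenha>primeiro texto da resenha</Resenha></doc>
      <doc id="d2"><Resenha>segundo texto da resenha</Resenha></doc>
    </corpus>"""

    dom = parseString(sample)
    filhos = dom.childNodes[0].childNodes
    reviews = []
    i = 3
    while i < len(filhos) - 1:
        doc_id = filhos[i].getAttribute('id')
        body = filhos[i].getElementsByTagName("Resenha")[0].childNodes[0].data
        reviews.append((doc_id, body))
        i += 2
    # reviews now pairs each doc id with its review text.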
Example #5
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Implementation of method that opens file, tokenizes it and adds it to
        the corpus.
        
        @param _file: file to be loaded
        @param _class: class of the file
        """
        #initialization
        handlerbody = ReutersHandler("BODY")
        parserbody = make_parser()
        parserbody.setContentHandler(handlerbody)
        parserbody.parse(_file)
        
        ############## Stopword removal ##########################
        stopwordFile = "stopwords" + self._lang
        f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus", stopwordFile), 'rt')
        stopwords = f.read()
        f.close()
        stopwords = stopwords.split()
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################
        
        body_data = "" 
        for i in range(len(handlerbody.LABEL)):
            body_data = str(handlerbody.LABEL[i])
            text = body_data.split()        
            tokens = []        
            tok = Tokenizer()        
            for word in text:
                tokens.extend(tok.fineTokenization(word))            
            token_dict = {}
            for token in tokens:
                try:
                    token_dict[token.lower()] += 1
                except KeyError:
                    token_dict[token.lower()] = 1
            #document is a CorpusDocument object. The docid is the path to the
            #file (_file).
            document = CorpusDocument(_file)
            for key, value in token_dict.iteritems():
                if not (rem_stopwords and key in stopwords):
                    if stemmer is not None:
                        key = stemmer.stem(key)
                    if key is not None:
                        document[self._add_word_to_lex(key)] = value
                    
            if self.insert_document(document, _class):
                self._file_number += 1
            else:
                self._repeated_files += 1
            body_data = ""
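ReutersHandler itself is not shown in this example. From the way it is used above, it appears to be a SAX ContentHandler that collects the character data of every element with the requested tag into a LABEL list. A minimal sketch consistent with that usage follows; it is an assumption based on the call sites, not the project's actual implementation.

    from xml.sax.handler import ContentHandler

    class ReutersHandler(ContentHandler):
        """Sketch: gather the text of every element named `tag` into self.LABEL."""

        def __init__(self, tag):
            ContentHandler.__init__(self)
            self._tag = tag
            self._inside = False
            self._chunks = []
            self.LABEL = []

        def startElement(self, name, attrs):
            if name == self._tag:
                self._inside = True
                self._chunks = []

        def characters(self, content):
            if self._inside:
                self._chunks.append(content)

        def endElement(self, name):
            if name == self._tag:
                self.LABEL.append("".join(self._chunks))
                self._inside = False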