def load_and_process_samples(self, pathFilesF, pathFilesM, pathSave,
                                 setId):
        """Load class-labelled samples and their meta-attributes.

        When ``self.loadCSV`` is falsy, both dictionaries are loaded from
        objects previously cached under *pathSave*.  Otherwise the samples
        are read from the female/male CSV paths, cached, transformed into
        meta-attributes, and the meta-attributes cached as well.

        :param pathFilesF: path to the CSV files of the female class
        :param pathFilesM: path to the CSV files of the male class
        :param pathSave: directory used by ``FileUtils`` for (de)serialization
        :param setId: identifier appended to the cache object names
        :return: tuple ``(samplesInClasses, samplesInClassesMA)`` of dicts
        """
        fileUtils = FileUtils(pathSave)
        if not self.loadCSV:  # idiomatic truth test instead of `== False`
            samplesInClasses = fileUtils.load_object(
                "samples_in_classes_" + str(setId), "dict")
            samplesInClassesMA = fileUtils.load_object(
                "samples_in_classes_MA_" + str(setId), "dict")
        else:
            samplesInClasses = self._read_csv(pathFilesF, pathFilesM)
            fileUtils.save_object(samplesInClasses,
                                  "samples_in_classes_" + str(setId), "dict")
            # Transform the raw samples into meta-attributes.
            samplesInClassesMA = self._extract_meta_attributes(
                samplesInClasses)
            fileUtils.save_object(samplesInClassesMA,
                                  "samples_in_classes_MA_" + str(setId),
                                  "dict")
        return samplesInClasses, samplesInClassesMA
# Esempio n. 2  (scraping artifact: separator between two unrelated snippets)
# 0
    def __init__(self, text):
        """Compute stylometric meta-attributes for *text*.

        Populates character counts (C, LOWER, UPPER, NUMBERS, WHITE, TAB),
        token/sentence/paragraph structures (WORDS, SENTENCES, PARAGRAPHS),
        word-length and word-frequency distributions (SIZES, FREQ, V),
        hapax/dis legomena lists (HXLEGO, HXDISLEGO), POS tags (TAGGED)
        and sentiment lexicons.

        NOTE(review): several attribute assignments below sit one
        indentation level deeper than they appear intended (inside loop
        bodies without using the loop variable) — flagged individually.
        """
        import time
        # NOTE(review): time.clock() was removed in Python 3.8; together
        # with the commented-out `print` statements this indicates the
        # code targets Python 2.
        start = time.clock()
        '''
        -----------------------------------------------------------------------------------------------------------------------
        DEFINICAO DOS PARAMETROS DE CONTROLE
        -----------------------------------------------------------------------------------------------------------------------
        '''        
        tp = TextProcessing()
        
        self.nMaxLengthFreq = 16 
#       OBS1: maximum word length considered in the word-length frequency distribution
        savePath = "/home/ahirton/Python/gender_classification/outputfiles/"
        #savePath = "/home/rpasti/workspace/gender_classification/outputfiles/"
        # POS-tag the tokenized text ("en" = English tagger).
        tagged = tp.tagging([tp.tokenize([text])[0]],savePath,"en")[0]
        fileUtils = FileUtils(savePath)
        
        # NOTE(review): this removes every literal occurrence of the
        # substring "http", not whole URLs — confirm that is intended.
        text = re.sub("http","", text)
        self.raw = text
        
#        print tagged

        self.PARAGRAPHS = []
        self.SENTENCES = []
        self.WORDS = []
        # Paragraph delimiters; escaped and OR-joined into one regex.
        delimiters = '\n','. \n', '! \n', '?\n', '.\n', '!\n', '?\n', '... \n' #, '... \n'#, ' \n ' #, " .\n", " !\n", ' ?\n'
        regexPattern = '|'.join(map(re.escape, delimiters))
       
        for paragraph in re.split(regexPattern,self.raw):        
            p = []
#            print ""
#            print paragraph            
#            raw_input(".----------------.END OF PARAGRAPH----------------.")
            #sentences = tp.tokenize_sentence([paragraph])[0]
            for sentence in tp.tokenize_sentence([paragraph])[0]: 
#                print ""
#                print sentence
#                print tp.tagging(tp.tokenize([sentence]))
#                raw_input(".---------------..END OF SENTENCE...------.")
                words = tp.tokenize([sentence])[0]
                #words = tp.remove_punctuation([words])[0]
                self.WORDS.extend(words)
                self.SENTENCES.append(sentence)
                p.append(words)
#                print paragraph
#                print sentence
#                print words
#                print self.WORDS
#                raw_input('XXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
            self.PARAGRAPHS.append(p)
            # NOTE(review): the block below runs once per paragraph but
            # never uses the loop variable — it looks mis-indented (one
            # level too deep). Final values are unaffected because the
            # last iteration wins, but the work is repeated needlessly.
            self.C = len(text)
            self.LOWER = MetaAttributes._count_char(text, "^[a-z_-]*$")
            self.UPPER = MetaAttributes._count_char(text, "^[A-Z_-]*$")
            self.NUMBERS = MetaAttributes._count_char(text, "^[\d]*$")
            self.WHITE = MetaAttributes._count_char(text, "^[ ]*$")
            self.TAB = MetaAttributes._count_char(text, "^[\t]*$")
            self.N = len(self.WORDS)
            self.SIZES = []
            self.FREQ = {}
        
        for w in self.WORDS:            
            self.SIZES.append(len(w))
            # NOTE(review): likewise mis-indented — the full FreqDist is
            # rebuilt for every single word (quadratic), and HXLEGO /
            # HXDISLEGO are re-reset each iteration. Also: if WORDS is
            # empty, V / VRICH / HXLEGO / HXDISLEGO are never created.
            self.FREQ = dict(nltk.FreqDist(self.WORDS))
            self.V = dict(nltk.FreqDist(self.FREQ.values())) 
            self.VRICH = self.N - len(self.V)
            self.HXLEGO = []
            self.HXDISLEGO = []

        # Partition vocabulary into hapax (freq 1) and dis (freq 2) legomena.
        for w, t in self.FREQ.items():
            if t == 1:
                self.HXLEGO.append(w)
            elif t == 2:
                self.HXDISLEGO.append(w)
                
            # NOTE(review): TAGGED and S are loop-invariant; they too look
            # mis-indented and are never set when FREQ is empty.
            self.TAGGED = tagged
            self.S = len(self.SENTENCES)
            
        # Sentiment word lists and the LIWC dictionary (loaded from cache).
        self.pwdictionary = semantic_dictionaries.extended_positive()
        self.nwdictionary = semantic_dictionaries.extended_negative()
        self.neutralwdictionary = semantic_dictionaries.extended_neutral_words()
        self.LIWCdict = fileUtils.load_object("liwc", "dict")