def load_and_process_samples(self, pathFilesF, pathFilesM, pathSave, setId):
    fileUtils = FileUtils(pathSave)
    if not self.loadCSV:
        # Reuse the previously serialized objects instead of re-reading the CSV files.
        samplesInClasses = fileUtils.load_object(
            "samples_in_classes_" + str(setId), "dict")
        samplesInClassesMA = fileUtils.load_object(
            "samples_in_classes_MA_" + str(setId), "dict")
    else:
        samplesInClasses = self._read_csv(pathFilesF, pathFilesM)
        fileUtils.save_object(samplesInClasses,
                              "samples_in_classes_" + str(setId), "dict")
        # Transform the raw samples into meta-attributes.
        samplesInClassesMA = self._extract_meta_attributes(samplesInClasses)
        fileUtils.save_object(samplesInClassesMA,
                              "samples_in_classes_MA_" + str(setId), "dict")
    return samplesInClasses, samplesInClassesMA
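# --- Usage sketch (illustrative, not part of the original code) ---
# A minimal example of how load_and_process_samples might be called. The owning
# class name `SampleLoader`, the `loadCSV` flag value, and the CSV paths below
# are assumptions for illustration only; the real names are defined elsewhere
# in the repository.
#
#     loader = SampleLoader()                     # hypothetical owning class
#     loader.loadCSV = True                       # force reading the raw CSV files
#     samples, samplesMA = loader.load_and_process_samples(
#         "data/female_blogs.csv",                # hypothetical path (pathFilesF)
#         "data/male_blogs.csv",                  # hypothetical path (pathFilesM)
#         "/home/ahirton/Python/gender_classification/outputfiles/",
#         0)                                      # setId used in the pickle names
#
#     # On later runs, loadCSV can be set to False so the cached
#     # "samples_in_classes_0" / "samples_in_classes_MA_0" objects are reused.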
def __init__(self, text):
    import time
    start = time.time()  # timing marker (time.clock() is deprecated/removed in Python 3)

    # -------------------------------------------------------------------
    # Definition of the control parameters
    # -------------------------------------------------------------------
    tp = TextProcessing()
    # NOTE 1: maximum word length considered in the word-length frequency distribution.
    self.nMaxLengthFreq = 16
    savePath = "/home/ahirton/Python/gender_classification/outputfiles/"
    #savePath = "/home/rpasti/workspace/gender_classification/outputfiles/"
    tagged = tp.tagging([tp.tokenize([text])[0]], savePath, "en")[0]
    fileUtils = FileUtils(savePath)

    # Strip "http" substrings so URLs do not distort the character counts.
    text = re.sub("http", "", text)
    self.raw = text

    # Split the raw text into paragraphs, sentences and words.
    self.PARAGRAPHS = []
    self.SENTENCES = []
    self.WORDS = []
    delimiters = '\n', '. \n', '! \n', '?\n', '.\n', '!\n', '... \n'
    regexPattern = '|'.join(map(re.escape, delimiters))
    for paragraph in re.split(regexPattern, self.raw):
        p = []
        for sentence in tp.tokenize_sentence([paragraph])[0]:
            words = tp.tokenize([sentence])[0]
            self.WORDS.extend(words)
            self.SENTENCES.append(sentence)
            p.append(words)
        self.PARAGRAPHS.append(p)

    # Character-level counts.
    self.C = len(text)
    self.LOWER = MetaAttributes._count_char(text, r"^[a-z_-]*$")
    self.UPPER = MetaAttributes._count_char(text, r"^[A-Z_-]*$")
    self.NUMBERS = MetaAttributes._count_char(text, r"^[\d]*$")
    self.WHITE = MetaAttributes._count_char(text, r"^[ ]*$")
    self.TAB = MetaAttributes._count_char(text, r"^[\t]*$")

    # Word-level counts and frequency distributions.
    self.N = len(self.WORDS)
    self.SIZES = [len(w) for w in self.WORDS]
    self.FREQ = dict(nltk.FreqDist(self.WORDS))
    self.V = dict(nltk.FreqDist(self.FREQ.values()))
    self.VRICH = self.N - len(self.V)

    # Hapax legomena (frequency 1) and hapax dislegomena (frequency 2).
    self.HXLEGO = []
    self.HXDISLEGO = []
    for w, t in self.FREQ.items():
        if t == 1:
            self.HXLEGO.append(w)
        elif t == 2:
            self.HXDISLEGO.append(w)

    self.TAGGED = tagged
    self.S = len(self.SENTENCES)

    # Semantic dictionaries and the LIWC lexicon.
    self.pwdictionary = semantic_dictionaries.extended_positive()
    self.nwdictionary = semantic_dictionaries.extended_negative()
    self.neutralwdictionary = semantic_dictionaries.extended_neutral_words()
    self.LIWCdict = fileUtils.load_object("liwc", "dict")
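# --- Usage sketch (illustrative, not part of the original code) ---
# A minimal example of how the constructor above might be exercised on a short
# English text, assuming it belongs to the MetaAttributes class (consistent with
# the MetaAttributes._count_char calls above). The sample string is made up;
# the attribute names are the ones assigned in the constructor.
#
#     sample = "I loved the concert last night! It was amazing. We stayed late."
#     ma = MetaAttributes(sample)
#     print(ma.C)           # total number of characters
#     print(ma.N)           # number of word tokens
#     print(ma.S)           # number of sentences
#     print(ma.HXLEGO)      # words occurring exactly once
#     print(ma.HXDISLEGO)   # words occurring exactly twice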