def _clean_content(self, content):
    """Scrub *content* and strip every ``!important`` marker from it.

    The text is passed through ``Scrubber().scrub``. If the scrubbed
    result is shorter than 1% of the input's length, the scrubbed text is
    discarded and the unmodified input is used instead.

    NOTE(review): the fallback presumably guards against the scrubber
    discarding almost everything -- confirm against Scrubber's behavior.

    Returns the resulting text with all ``'!important'`` substrings removed.
    """
    scrubbed = Scrubber().scrub(content)

    # Fall back to the raw input when scrubbing left under 1% of it.
    if len(scrubbed) < len(content) * 0.01:
        scrubbed = content

    return scrubbed.replace('!important', '')
# Load the file named by ppeFinalFile; fields are separated by the
# control character written as the octal escape \031 (0x19).
df = pd.read_csv(ppeFinalFile, sep='\031')

# Initial formatting
# Single-argument print(...) is used throughout: it produces the same
# output under Python 2 and is valid syntax under Python 3.
print('Initial Formatting...')
df = format.initial_format(df)
# TODO: do initial scrubbing here as well

# Target concept
print('Extracting target concept..')
tc = TargetConcept(df)
df_target_concept = tc.target_concept(config.has_label)
print('\nTarget concept: ' + str(df_target_concept.columns))

# Null column scrubbing
print('Scrubbing sparse features..')
scrubber = Scrubber(df)
scrubber.initial_nullscrubber_percent()
print('\nNull scrubbed features: ' + str(scrubber.scrubbed_list))

# Column typing
print('\nLoad column typer keywords...')
ct = Typer(df)
master_list = ct.column_typer()
# Hand the null-scrubbed column names to the scrubber together with the
# per-type column lists produced by the typer.
scrubber.remove(
    scrubber.scrubbed_list,
    master_list['cat_list'],
    master_list['num_list'],
    master_list['date_list'],
    master_list['zip_list'],
)
print('\nDates: ' + str(master_list['date_list']))
print('\nGeos: ' + str(master_list['zip_list']))

# Initial scrubbing
print('Initial scrubbing...')
# NOTE(review): the absolute scrubber call is currently disabled.
#scrubber.initial_scrubber_abs()
def clean_content(self, content):
    """Scrub *content* and strip every ``!important`` marker from it.

    The text is passed through ``Scrubber().scrub`` and all occurrences
    of the literal substring ``'!important'`` are then removed from the
    scrubbed result, which is returned.
    """
    scrubbed = Scrubber().scrub(content)
    return scrubbed.replace('!important', '')
def clean_content(self, content):
    """Return *content* after passing it through a fresh ``Scrubber``.

    A new ``Scrubber`` is created on every call; whatever its ``scrub``
    method returns is handed back unchanged.
    """
    cleaned = Scrubber().scrub(content)
    return cleaned