def preprocess(self, join_type='inner', encode=False, interaction=0):
    """Build ``self.merged_sf`` from the label and cluster-feature CSVs.

    Steps, in order:

    1. Read the CSV at ``self.ground_truth`` and keep only the
       ``self.labels_key_col`` and ``self.labels_value_col`` columns.
    2. Read the CSV at ``self.clusteral_features``, rename
       ``self.clusteral_key_col`` to ``self.labels_key_col`` and drop the
       columns ``old_cid``, ``orig_cid``, ``gend`` plus every column whose
       name contains ``'alter'``.
    3. Fill missing values of every remaining feature column with that
       column's 0.5 quantile as reported by ``gl.Sketch``.
    4. Join labels and features on the key column, fill missing label
       values with ``0`` and store the result in ``self.merged_sf``.
    5. Optionally encode the labels and append quadratic features.

    Args:
        join_type: Forwarded as ``how`` to ``SFrame.join``.
        encode: When truthy, each label value is replaced by
            ``self.util.bin_encode(value)``.
        interaction: When truthy, ``fe.QuadraticFeatures`` is applied to
            ``self.merged_sf``. With ``self.baseline == 0`` only a
            restricted set of columns is used (see the inline comments);
            otherwise all candidate feature columns are used.

    Returns:
        None. The result is written to ``self.merged_sf``; progress
        messages are printed to stdout.
    """
    labels_key_col = self.labels_key_col
    labels_value_col = self.labels_value_col
    clusteral_key_col = self.clusteral_key_col

    lf = gl.SFrame.read_csv(self.ground_truth)[
        [labels_key_col, labels_value_col]]
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Shape of the labels file is  %s' % (lf.shape,))

    cf = gl.SFrame.read_csv(self.clusteral_features).rename(
        {clusteral_key_col: labels_key_col})
    print('Shape of the Clusteral file is  %s' % (cf.shape,))

    dropped = ('old_cid', 'orig_cid', 'gend')
    cf = cf[[name for name in cf.column_names()
             if name not in dropped and 'alter' not in name]]

    print('Filling NAs')
    for col in cf.column_names():
        cf = cf.fillna(col, gl.Sketch(cf[col]).quantile(0.5))
    # NOTE: feature normalisation (self.normalize) is not applied here; the
    # call was already disabled in the previous revision.

    self.merged_sf = lf.join(
        cf, on=labels_key_col, how=join_type).fillna(labels_value_col, 0)
    print('Shape of the merged file is  %s' % (self.merged_sf.shape,))
    print(self.merged_sf[labels_value_col].sketch_summary())

    if encode:
        self.merged_sf[labels_value_col] = self.merged_sf[
            labels_value_col].apply(lambda x: self.util.bin_encode(x))
        # Print the summary again so the effect of the encoding is visible.
        print(self.merged_sf[labels_value_col].sketch_summary())

    if interaction:
        # Feature columns eligible for interaction terms: everything whose
        # name does not contain one of the key / label column names.
        candidates = [
            name for name in cf.column_names()
            if labels_key_col not in name
            and clusteral_key_col not in name
            and labels_value_col not in name
        ]

        if self.baseline == 0:
            # Keep only the spectral features whose name contains a run of
            # two or more digits. The spectral digits are in reverse order
            # of importance, so after filtering the names are
            # reverse-sorted and the first five are kept.
            two_plus_digits = re.compile(r'\d{2,}')

            def top_spectral(columns, direction):
                # `direction` is matched as a plain substring of the name.
                matching = [
                    name for name in columns
                    if direction in name and two_plus_digits.search(name)
                ]
                return sorted(matching, reverse=True)[:5]

            ic_norm = [name for name in candidates if 'norm' in name]
            ic_unnorm = [name for name in candidates if 'norm' not in name]
            norm_feats = (top_spectral(ic_norm, 'in')
                          + top_spectral(ic_norm, 'out'))
            unnorm_feats = (top_spectral(ic_unnorm, 'in')
                            + top_spectral(ic_unnorm, 'out'))

            # NOTE(review): both transformers use the default output column
            # name; confirm how GraphLab handles the name collision on the
            # second transform.
            quad_norm = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=norm_feats))
            print('Applying Quadratic Transformation on normalized')
            self.merged_sf = quad_norm.transform(self.merged_sf)

            quad_unnorm = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=unnorm_feats))
            print('Applying Quadratic Transformation on unnormalized columns')
            self.merged_sf = quad_unnorm.transform(self.merged_sf)
        else:
            print('Applying feature transformation in the case when the '
                  'shape of the sf is low %s' % (self.merged_sf.shape,))
            quad_transform = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=candidates))
            self.merged_sf = quad_transform.transform(self.merged_sf)

    print('Preprocessing complete')
def preprocess(clusteral_features, ground_truth, output_file,
               clusteral_key_col, labels_key_col, labels_value_col,
               interaction=0, join_type='inner', encode=False):
    """Join a labels CSV with a cluster-features CSV and return the SFrame.

    Args:
        clusteral_features: Path of the features CSV. Its
            ``clusteral_key_col`` column is renamed to ``labels_key_col``;
            the columns ``old_cid``, ``orig_cid``, ``gend`` and every
            column whose name contains ``'alter'`` are dropped.
        ground_truth: Path of the labels CSV; only ``labels_key_col`` and
            ``labels_value_col`` are kept.
        output_file: Unused by this function; kept so existing callers
            keep working.
        clusteral_key_col: Name of the key column in the features CSV.
        labels_key_col: Name of the key column in the labels CSV (also the
            join key).
        labels_value_col: Name of the label column.
        interaction: When truthy, ``fe.QuadraticFeatures`` over all
            feature columns is applied to the merged frame.
        join_type: Forwarded as ``how`` to ``SFrame.join``.
        encode: When truthy, each label value is replaced by
            ``bin_encode(value)``.

    Returns:
        The merged (and optionally transformed) SFrame.
    """
    lf = gl.SFrame.read_csv(ground_truth)[[labels_key_col, labels_value_col]]
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Shape of the labels file is  %s' % (lf.shape,))

    cf = gl.SFrame.read_csv(clusteral_features).rename(
        {clusteral_key_col: labels_key_col})
    print('Shape of the Clusteral file is  %s' % (cf.shape,))

    dropped = ('old_cid', 'orig_cid', 'gend')
    cf = cf[[name for name in cf.column_names()
             if name not in dropped and 'alter' not in name]]

    # Missing feature values are replaced by 0 in every remaining column.
    for col in cf.column_names():
        cf = cf.fillna(col, 0)

    merged_sf = lf.join(
        cf, on=labels_key_col, how=join_type).fillna(labels_value_col, 0)
    print('Shape of the merged file is  %s' % (merged_sf.shape,))
    print(merged_sf[labels_value_col].sketch_summary())

    if encode:
        merged_sf[labels_value_col] = merged_sf[labels_value_col].apply(
            lambda x: bin_encode(x))
        # Print the summary again so the effect of the encoding is visible.
        print(merged_sf[labels_value_col].sketch_summary())

    # Feature columns eligible for interaction terms: everything whose name
    # does not contain one of the key / label column names.
    interaction_columns = [
        name for name in cf.column_names()
        if labels_key_col not in name
        and clusteral_key_col not in name
        and labels_value_col not in name
    ]
    if DEBUG:
        print('Interaction.column_names()')
        print(interaction_columns)

    if interaction:
        quad = fe.create(
            merged_sf, fe.QuadraticFeatures(features=interaction_columns))
        print('Applying Quadratic Transformation')
        merged_sf = quad.transform(merged_sf)
        # NOTE: the generated 'quadratic_features' column is left packed;
        # an unpack step existed previously but was disabled.

    return merged_sf
def preprocess(self, join_type='inner', encode=False, interaction=0):
    """Build ``self.merged_sf`` from the label and cluster-feature CSVs.

    Steps, in order:

    1. Read the CSV at ``self.ground_truth`` and keep only the
       ``self.labels_key_col`` and ``self.labels_value_col`` columns.
    2. Read the CSV at ``self.clusteral_features``, rename
       ``self.clusteral_key_col`` to ``self.labels_key_col`` and drop the
       columns ``old_cid``, ``orig_cid``, ``gend`` plus every column whose
       name contains ``'alter'``.
    3. Fill missing values of every remaining feature column with that
       column's 0.5 quantile as reported by ``gl.Sketch``.
    4. Join labels and features on the key column, fill missing label
       values with ``0`` and store the result in ``self.merged_sf``.
    5. Optionally encode the labels and append quadratic features.

    Args:
        join_type: Forwarded as ``how`` to ``SFrame.join``.
        encode: When truthy, each label value is replaced by
            ``self.util.bin_encode(value)``.
        interaction: When truthy, ``fe.QuadraticFeatures`` is applied to
            ``self.merged_sf``. With ``self.baseline == 0`` only a
            restricted set of columns is used (see the inline comments);
            otherwise all candidate feature columns are used.

    Returns:
        None. The result is written to ``self.merged_sf``; progress
        messages are printed to stdout.
    """
    labels_key_col = self.labels_key_col
    labels_value_col = self.labels_value_col
    clusteral_key_col = self.clusteral_key_col

    lf = gl.SFrame.read_csv(self.ground_truth)[
        [labels_key_col, labels_value_col]]
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Shape of the labels file is  %s' % (lf.shape,))

    cf = gl.SFrame.read_csv(self.clusteral_features).rename(
        {clusteral_key_col: labels_key_col})
    print('Shape of the Clusteral file is  %s' % (cf.shape,))

    dropped = ('old_cid', 'orig_cid', 'gend')
    cf = cf[[name for name in cf.column_names()
             if name not in dropped and 'alter' not in name]]

    print('Filling NAs')
    for col in cf.column_names():
        cf = cf.fillna(col, gl.Sketch(cf[col]).quantile(0.5))
    # NOTE: feature normalisation (self.normalize) is not applied here; the
    # call was already disabled in the previous revision.

    self.merged_sf = lf.join(
        cf, on=labels_key_col, how=join_type).fillna(labels_value_col, 0)
    print('Shape of the merged file is  %s' % (self.merged_sf.shape,))
    print(self.merged_sf[labels_value_col].sketch_summary())

    if encode:
        self.merged_sf[labels_value_col] = self.merged_sf[
            labels_value_col].apply(lambda x: self.util.bin_encode(x))
        # Print the summary again so the effect of the encoding is visible.
        print(self.merged_sf[labels_value_col].sketch_summary())

    if interaction:
        # Feature columns eligible for interaction terms: everything whose
        # name does not contain one of the key / label column names.
        candidates = [
            name for name in cf.column_names()
            if labels_key_col not in name
            and clusteral_key_col not in name
            and labels_value_col not in name
        ]

        if self.baseline == 0:
            # Keep only the spectral features whose name contains a run of
            # two or more digits. The spectral digits are in reverse order
            # of importance, so after filtering the names are
            # reverse-sorted and the first five are kept.
            two_plus_digits = re.compile(r'\d{2,}')

            def top_spectral(columns, direction):
                # `direction` is matched as a plain substring of the name.
                matching = [
                    name for name in columns
                    if direction in name and two_plus_digits.search(name)
                ]
                return sorted(matching, reverse=True)[:5]

            ic_norm = [name for name in candidates if 'norm' in name]
            ic_unnorm = [name for name in candidates if 'norm' not in name]
            norm_feats = (top_spectral(ic_norm, 'in')
                          + top_spectral(ic_norm, 'out'))
            unnorm_feats = (top_spectral(ic_unnorm, 'in')
                            + top_spectral(ic_unnorm, 'out'))

            # NOTE(review): both transformers use the default output column
            # name; confirm how GraphLab handles the name collision on the
            # second transform.
            quad_norm = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=norm_feats))
            print('Applying Quadratic Transformation on normalized')
            self.merged_sf = quad_norm.transform(self.merged_sf)

            quad_unnorm = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=unnorm_feats))
            print('Applying Quadratic Transformation on unnormalized columns')
            self.merged_sf = quad_unnorm.transform(self.merged_sf)
        else:
            print('Applying feature transformation in the case when the '
                  'shape of the sf is low %s' % (self.merged_sf.shape,))
            quad_transform = fe.create(
                self.merged_sf, fe.QuadraticFeatures(features=candidates))
            self.merged_sf = quad_transform.transform(self.merged_sf)

    print('Preprocessing complete')
def create_onehot_features(train, interaction_columns, categories=300,
                           label='encoded_features'):
    """Create a one-hot encoding transformer from ``train``.

    Args:
        train: Data handed to ``fe.create`` together with the encoder.
        interaction_columns: Passed as ``features`` to ``fe.OneHotEncoder``.
        categories: Passed as ``max_categories`` to ``fe.OneHotEncoder``.
        label: Passed as ``output_column_name`` to ``fe.OneHotEncoder``.

    Returns:
        A ``(transformer, label)`` tuple: the object returned by
        ``fe.create`` and the output column name that was requested.
    """
    encoder_spec = fe.OneHotEncoder(
        features=interaction_columns,
        max_categories=categories,
        output_column_name=label,
    )
    transformer = fe.create(train, encoder_spec)
    return transformer, label
def create_quad_features(train, interaction_columns,
                         label='quadratic_features'):
    """Create a quadratic-features transformer from ``train``.

    Args:
        train: Data handed to ``fe.create`` together with the transformer.
        interaction_columns: Passed as ``features`` to
            ``fe.QuadraticFeatures``.
        label: Passed as ``output_column_name`` to ``fe.QuadraticFeatures``.

    Returns:
        A ``(transformer, label)`` tuple: the object returned by
        ``fe.create`` and the output column name that was requested.
    """
    quadratic_spec = fe.QuadraticFeatures(
        features=interaction_columns,
        output_column_name=label,
    )
    transformer = fe.create(train, quadratic_spec)
    return transformer, label