def preprocess(self, join_type='inner', encode=False, interaction=0):
		ground_truth=self.ground_truth
		labels_key_col=self.labels_key_col
		labels_value_col=self.labels_value_col
		clusteral_features=self.clusteral_features
		clusteral_key_col=self.clusteral_key_col		

		lf=gl.SFrame.read_csv(ground_truth)[[labels_key_col,labels_value_col]]
		print 'Shape of the labels file is ', lf.shape
		cf=gl.SFrame.read_csv(clusteral_features).rename({clusteral_key_col:labels_key_col})
		print 'Shape of the Clusteral file is ', cf.shape

		cf=cf[[each for each in cf.column_names() if each not in ['old_cid','orig_cid','gend'] and 'alter' not in each] ]
	
		print 'Filling NAs'	
		for col in cf.column_names():
			
			cf=cf.fillna(col,gl.Sketch(cf[col]).quantile(0.5))
	
	
		#cf=self.normalize(cf,self.ntype)
		self.merged_sf=lf.join(cf , on=labels_key_col, how=join_type).fillna(labels_value_col,0)
		print 'Shape of the merged file is ', self.merged_sf.shape
		print self.merged_sf[labels_value_col].sketch_summary()
		
		if encode:
			self.merged_sf[labels_value_col]=self.merged_sf[labels_value_col].apply(lambda x:self.util.bin_encode(x))
			self.merged_sf[labels_value_col].head(2)

		print self.merged_sf[labels_value_col].sketch_summary()

		if interaction:
			if self.baseline==0:
				regexp=re.compile('\d{2,}')
				# Filtering out the spectal features with two digits in it. As the spectral digits are in reverse order of importance, so first of all I want to filter the feature with two or more digits then I want to reverse sort them	
				ic_norm=[each for each in cf.column_names() if labels_key_col not in each and clusteral_key_col not in each and labels_value_col not in each and 'norm' in each]
				ic_norm_in=sorted([each for each in ic_norm  if 'in' in each and re.search(regexp,each)], reverse=True)[:5]
				ic_norm_out=sorted([each for each in ic_norm if 'out' in each and re.search(regexp,each)], reverse=True)[:5]
				
				ic_unnorm=[each for each in cf.column_names() if labels_key_col not in each and clusteral_key_col not in each and labels_value_col not in each and 'norm' not in each]
				ic_unnorm_in=sorted([each for each in ic_unnorm  if 'in' in each and re.search(regexp,each)], reverse=True)[:5]
				ic_unnorm_out=sorted([each for each in ic_unnorm if 'out' in each and re.search(regexp,each)],reverse=True)[:5]
				
				quad_norm = fe.create(self.merged_sf, fe.QuadraticFeatures(features=ic_norm_in+ic_norm_out))	
				print 'Applying Quadratic Transformation on normalized'
				self.merged_sf=quad_norm.transform(self.merged_sf)
				quad_unnorm= fe.create(self.merged_sf, fe.QuadraticFeatures(features=ic_unnorm_in+ic_unnorm_out))
				print 'Applying Quadratic Transformation on unnormalized columns'
				self.merged_sf=quad_unnorm.transform(self.merged_sf)
			else:
				print 'Applying feature transformation in the case when the shape of the sf is low', self.merged_sf.shape
				feats=[each for each in cf.column_names() if labels_key_col not in each and clusteral_key_col not     in each and labels_value_col not in each]
				quad_transform = fe.create(self.merged_sf, fe.QuadraticFeatures(features=feats))
				self.merged_sf=quad_transform.transform(self.merged_sf)
		#self.merged_sf=self.normalize(self.merged_sf, self.ntype)
		print 'Preprocessing complete'
Example #2
0
def preprocess(clusteral_features, ground_truth, output_file, clusteral_key_col, labels_key_col, labels_value_col, interaction=0, join_type='inner', encode=False):
	lf=gl.SFrame.read_csv(ground_truth)[[labels_key_col,labels_value_col]]
	print 'Shape of the labels file is ', lf.shape
	cf=gl.SFrame.read_csv(clusteral_features).rename({clusteral_key_col:labels_key_col})
	print 'Shape of the Clusteral file is ', cf.shape

	cf=cf[[each for each in cf.column_names() if each not in ['old_cid','orig_cid','gend'] and 'alter' not in each] ]
	
	for col in cf.column_names():
		cf=cf.fillna(col,0)
	
	merged_sf=lf.join(cf , on=labels_key_col, how=join_type).fillna(labels_value_col,0)
	print 'Shape of the merged file is ', merged_sf.shape
	print merged_sf[labels_value_col].sketch_summary()
	
	if encode:
		merged_sf[labels_value_col]=merged_sf[labels_value_col].apply(lambda x:bin_encode(x))
		merged_sf[labels_value_col].head(2)

	print merged_sf[labels_value_col].sketch_summary()

	interaction_columns=[each for each in cf.column_names() if labels_key_col not in each and clusteral_key_col not in each and labels_value_col not in each]
	if DEBUG:
		print 'Interaction.column_names()'
		print interaction_columns
		#sys.exit(0)
	if interaction:
		quad = fe.create(merged_sf, fe.QuadraticFeatures(features=interaction_columns))	
		print 'Applying Quadratic Transformation'
		merged_sf=quad.transform(merged_sf)
		#print 'Flattening the quadratic features'
		#merged_sf=merged_sf.unpack('quadratic_features')
	
	return merged_sf
Example #3
0
    def preprocess(self, join_type='inner', encode=False, interaction=0):
        ground_truth = self.ground_truth
        labels_key_col = self.labels_key_col
        labels_value_col = self.labels_value_col
        clusteral_features = self.clusteral_features
        clusteral_key_col = self.clusteral_key_col

        lf = gl.SFrame.read_csv(ground_truth)[[
            labels_key_col, labels_value_col
        ]]
        print 'Shape of the labels file is ', lf.shape
        cf = gl.SFrame.read_csv(clusteral_features).rename(
            {clusteral_key_col: labels_key_col})
        print 'Shape of the Clusteral file is ', cf.shape

        cf = cf[[
            each for each in cf.column_names() if
            each not in ['old_cid', 'orig_cid', 'gend'] and 'alter' not in each
        ]]

        print 'Filling NAs'
        for col in cf.column_names():

            cf = cf.fillna(col, gl.Sketch(cf[col]).quantile(0.5))

        #cf=self.normalize(cf,self.ntype)
        self.merged_sf = lf.join(cf, on=labels_key_col,
                                 how=join_type).fillna(labels_value_col, 0)
        print 'Shape of the merged file is ', self.merged_sf.shape
        print self.merged_sf[labels_value_col].sketch_summary()

        if encode:
            self.merged_sf[labels_value_col] = self.merged_sf[
                labels_value_col].apply(lambda x: self.util.bin_encode(x))
            self.merged_sf[labels_value_col].head(2)

        print self.merged_sf[labels_value_col].sketch_summary()

        if interaction:
            if self.baseline == 0:
                regexp = re.compile('\d{2,}')
                # Filtering out the spectal features with two digits in it. As the spectral digits are in reverse order of importance, so first of all I want to filter the feature with two or more digits then I want to reverse sort them
                ic_norm = [
                    each for each in cf.column_names()
                    if labels_key_col not in each and clusteral_key_col not in
                    each and labels_value_col not in each and 'norm' in each
                ]
                ic_norm_in = sorted([
                    each for each in ic_norm
                    if 'in' in each and re.search(regexp, each)
                ],
                                    reverse=True)[:5]
                ic_norm_out = sorted([
                    each for each in ic_norm
                    if 'out' in each and re.search(regexp, each)
                ],
                                     reverse=True)[:5]

                ic_unnorm = [
                    each for each in cf.column_names()
                    if labels_key_col not in each
                    and clusteral_key_col not in each
                    and labels_value_col not in each and 'norm' not in each
                ]
                ic_unnorm_in = sorted([
                    each for each in ic_unnorm
                    if 'in' in each and re.search(regexp, each)
                ],
                                      reverse=True)[:5]
                ic_unnorm_out = sorted([
                    each for each in ic_unnorm
                    if 'out' in each and re.search(regexp, each)
                ],
                                       reverse=True)[:5]

                quad_norm = fe.create(
                    self.merged_sf,
                    fe.QuadraticFeatures(features=ic_norm_in + ic_norm_out))
                print 'Applying Quadratic Transformation on normalized'
                self.merged_sf = quad_norm.transform(self.merged_sf)
                quad_unnorm = fe.create(
                    self.merged_sf,
                    fe.QuadraticFeatures(features=ic_unnorm_in +
                                         ic_unnorm_out))
                print 'Applying Quadratic Transformation on unnormalized columns'
                self.merged_sf = quad_unnorm.transform(self.merged_sf)
            else:
                print 'Applying feature transformation in the case when the shape of the sf is low', self.merged_sf.shape
                feats = [
                    each for each in cf.column_names()
                    if labels_key_col not in each and clusteral_key_col not in
                    each and labels_value_col not in each
                ]
                quad_transform = fe.create(
                    self.merged_sf, fe.QuadraticFeatures(features=feats))
                self.merged_sf = quad_transform.transform(self.merged_sf)
        #self.merged_sf=self.normalize(self.merged_sf, self.ntype)
        print 'Preprocessing complete'
Example #4
0
def create_onehot_features(train, interaction_columns, categories=300, label='encoded_features'):
    """Create a one-hot encoder for ``train`` and return it with its output column name.

    Args:
        train: data handed to ``fe.create`` together with the encoder spec.
        interaction_columns: passed as ``features`` to ``fe.OneHotEncoder``.
        categories: passed as ``max_categories`` to ``fe.OneHotEncoder``.
        label: passed as ``output_column_name`` to ``fe.OneHotEncoder``.

    Returns:
        A ``(encoder, label)`` tuple, where ``encoder`` is whatever
        ``fe.create`` returns.
    """
    spec = fe.OneHotEncoder(
        features=interaction_columns,
        max_categories=categories,
        output_column_name=label,
    )
    encoder = fe.create(train, spec)
    return encoder, label
Example #5
0
def create_quad_features(train, interaction_columns, label='quadratic_features'):
    """Create a quadratic-feature transformer for ``train`` and return it with its output column name.

    Args:
        train: data handed to ``fe.create`` together with the transformer spec.
        interaction_columns: passed as ``features`` to ``fe.QuadraticFeatures``.
        label: passed as ``output_column_name`` to ``fe.QuadraticFeatures``.

    Returns:
        A ``(transformer, label)`` tuple, where ``transformer`` is whatever
        ``fe.create`` returns.
    """
    spec = fe.QuadraticFeatures(
        features=interaction_columns,
        output_column_name=label,
    )
    transformer = fe.create(train, spec)
    return transformer, label