def get_features(instances, num_processes):
    """Run both feature-generation stages over ``instances`` and return the result.

    Stage one builds a ``Pipe`` from the Jaccard and infix feature generators
    and calls ``push_all_parallel()`` on it. Stage two feeds that pipe's
    ``instances`` into a ``BlocksPipe`` that uses an ``InstancesGrouper`` keyed
    on ``earmark_id`` together with two ``RankingFeatureGenerator`` objects
    (both in feature group ``JACCARD_FG`` with prefix ``G1_``).

    Args:
        instances: Instances handed to the first-stage ``Pipe``.
        num_processes: Forwarded as ``num_processes`` to both pipes.

    Returns:
        The ``instances`` attribute of the second-stage ``BlocksPipe``.
    """
    logging.info("Creating pipe")

    # Stage 1: per-instance feature generators.
    instance_pipe = Pipe(
        [JaccardFeatureGenerator(), InfixFeatureGenerator()],
        instances,
        num_processes=num_processes,
    )
    instance_pipe.push_all_parallel()

    # Stage 2: ranking features computed over blocks grouped by earmark.
    ranked_features = (
        "JACCARD_FG_max_inferred_name_jaccard",
        "JACCARD_FG_max_cell_jaccard",
    )
    ranking_generators = [
        RankingFeatureGenerator(
            feature_group="JACCARD_FG", feature=feature_name, prefix="G1_"
        )
        for feature_name in ranked_features
    ]
    block_pipe = BlocksPipe(
        InstancesGrouper(["earmark_id"]),
        ranking_generators,
        instance_pipe.instances,
        num_processes=num_processes,
    )
    block_pipe.push_all_parallel()

    return block_pipe.instances
def serialize_student_group(students, data_folder): instances = [] for i in range(len(students)): for j in range(i+1, len(students), 1): instances.append(get_instance(students[i], students[j])) logging.info("Created %d instances" %(len(instances))) if len(instances) == 0: logging.warn("FAILED TO GENERATE INSTANCES!") return fgs = [IsSameFeatureGenerator(fields=['ZipCode', 'Gender', 'Language', 'HomeLanguage' ,'BirthCountry', 'Race', 'Food', 'ESL', 'LEP', 'SpecialED','CatchmentSchool', 'ThisGradeSchoolKey']), AbsoluteDifferenceFeatureGenerator(fields=['GPA', 'EighthMathISAT', 'EighthReadingISAT', 'AttendanceRate']), DistanceFeatureGenerator(), OtherFeaturesFeatureGenerator(), ] pipe = pipe = Pipe(fgs, instances, num_processes=1) pipe.push_all_parallel() serialize_instances(pipe.instances, data_folder) """
def get_features(instances, num_processes): logging.info("Creating pipe") fgs = [JaccardFeatureGenerator(), InfixFeatureGenerator()] pipe = Pipe(fgs, instances, num_processes=num_processes) pipe.push_all_parallel() #group by earmark fgs = [ RankingFeatureGenerator(feature_group="JACCARD_FG", feature="JACCARD_FG_max_inferred_name_jaccard", prefix='G1_'), RankingFeatureGenerator(feature_group="JACCARD_FG", feature="JACCARD_FG_max_cell_jaccard", prefix='G1_') ] grouper = InstancesGrouper(['earmark_id']) pipe = BlocksPipe(grouper, fgs, pipe.instances, num_processes=num_processes) pipe.push_all_parallel() return pipe.instances