def _generate_metadata_from_directory(datasetDirectory, targetField='class',
                                      checkSubdirectories=True,
                                      fileExtensions=('.csv', '.tsv')):
    """Extract metafeatures for all dataset files in the directory

    :returns dict: dataset name(str):metafeatures(dataFrame)
    """
    logger.info(
        f"generating metafeatures for files in directory '{datasetDirectory}', "
        f"targetField={targetField}, checkSubdirectories={checkSubdirectories}, "
        f"fileExtensions={fileExtensions}")

    if not datasetDirectory:
        raise ValueError(
            "Could not generate metadata from directory, "
            "'datasetDirectory' must be specified")

    metafeaturesData = {}
    for root, dirs, files in os.walk(datasetDirectory):
        # honor checkSubdirectories: pruning dirs in place stops os.walk
        # from descending below the top level
        if not checkSubdirectories:
            dirs.clear()
        for name in files:
            extension = os.path.splitext(name)[1]
            if not name.startswith('.') and extension in fileExtensions:
                # split twice to handle double extensions, i.e.
                # 'myfile.tsv.gz' => 'myfile'
                dataset = os.path.splitext(os.path.splitext(name)[0])[0]
                datapath = os.path.join(root, name)
                logger.debug(f"Generating metadata for {datapath}")
                metafeatures = mf.generate_metafeatures_from_filepath(
                    datapath, targetField)
                metafeaturesData[dataset] = metafeatures
    return metafeaturesData
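# A minimal usage sketch for the function above, assuming the `mf` module and
# `logger` are imported/configured as in this file; the 'datasets/' path and
# 'class' target column are hypothetical stand-ins, not fixed values.
metadata = _generate_metadata_from_directory(
    'datasets/', targetField='class', checkSubdirectories=False)
for name, metafeatures in metadata.items():
    # one entry per dataset file found, keyed by the file's stem
    print(name, len(metafeatures))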
def _generate_metadata_from_directory(datasetDirectory, targetField='class',
                                      checkSubdirectories=True):
    """Extract metafeatures for all .csv files in the directory

    :returns dict: dataset name(str):metafeatures(dataFrame)
    """
    metafeaturesData = {}
    for root, dirs, files in os.walk(datasetDirectory):
        # honor checkSubdirectories: pruning dirs in place stops the walk
        # from descending below the top level
        if not checkSubdirectories:
            dirs.clear()
        for name in files:
            if not name.startswith('.') and name.endswith('.csv'):
                dataset = os.path.splitext(name)[0]
                datapath = os.path.join(root, name)
                logger.debug("Generating metadata for {}".format(datapath))
                metafeatures = mf.generate_metafeatures_from_filepath(
                    datapath, targetField)
                metafeaturesData[dataset] = metafeatures
    return metafeaturesData
def test_generate_metafeatures_from_filepath(self):
    result = mf.generate_metafeatures_from_filepath(
        self.irisPath, self.irisTarget)
    # assertEquals is a deprecated alias; assertEqual is the supported name
    self.assertEqual(set(result.keys()),
                     set(self.expectedMetafeatureKeys))
    self.assertEqual(result[self.depColMetafeature], self.irisTarget)
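# For context, a sketch of the fixture the test above assumes. The attribute
# values here are hypothetical placeholders for illustration, not the
# repository's actual fixture.
import unittest

class MetafeaturesTestCase(unittest.TestCase):

    def setUp(self):
        self.irisPath = 'data/iris.csv'            # path to a labeled iris csv
        self.irisTarget = 'class'                  # name of the target column
        self.depColMetafeature = '_dependent_col'  # placeholder key name
        self.expectedMetafeatureKeys = []          # fill with expected keys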
# load pmlb data
pmlb_data = pd.read_csv(args.pmlb_file, compression='gzip',
                        sep='\t').fillna('')
print('dataset cols:', pmlb_data.columns)

for dataset, dfg in pmlb_data.groupby('dataset'):
    # locate the single data file for this dataset
    if args.flat:
        dataset_path = glob(args.data_dir + '/*' + dataset + '*' + args.tail)
    else:
        dataset_path = glob(args.data_dir + '/' + dataset + '/*' + args.tail)
    assert len(dataset_path) == 1, \
        f"expected exactly one file for '{dataset}', got {dataset_path}"
    dataset_path = dataset_path[0]
    print('dataset_path:', dataset_path)

    metafeatures = generate_metafeatures_from_filepath(
        dataset_path, args.label, compression=compression)
    # ignore_nan=True serializes NaN as null instead of raising an error
    meta_json = simplejson.dumps(metafeatures, ignore_nan=True)

    savepath = os.path.join(args.savedir, dataset)
    os.makedirs(savepath, exist_ok=True)
    out_file = os.path.join(savepath, 'metafeatures.json')
    with open(out_file, 'w') as out:
        out.write(meta_json)
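# The `args` and `compression` names used above come from the script's CLI
# setup, which isn't shown here. A plausible reconstruction under that
# assumption follows; every flag name and default is inferred from the
# attributes used above and should be treated as hypothetical.
import argparse

parser = argparse.ArgumentParser(
    description='Generate metafeature JSON files for PMLB datasets.')
parser.add_argument('pmlb_file', help='gzipped tsv summarizing pmlb datasets')
parser.add_argument('data_dir', help='root directory containing the datasets')
parser.add_argument('--label', default='class', help='target column name')
parser.add_argument('--tail', default='.tsv.gz', help='dataset file suffix')
parser.add_argument('--flat', action='store_true',
                    help='datasets sit directly in data_dir, not subfolders')
parser.add_argument('--savedir', default='metafeatures/',
                    help='output directory for metafeatures.json files')
args = parser.parse_args()
compression = 'gzip' if args.tail.endswith('.gz') else None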