Example #1
import os

# 'logger' (a logging.Logger) and 'mf' (the module providing
# generate_metafeatures_from_filepath) are defined at module level
# in the original source file.


def _generate_metadata_from_directory(datasetDirectory,
                                      targetField='class',
                                      checkSubdirectories=True,
                                      fileExtensions=('.csv', '.tsv')):
    """Extract metafeatures for all dataset files in the directory

    :returns dict: dataset name(str):metafeatures(dataFrame)
    """
    logger.info(
        f"generating metafeatures for files in directory '{datasetDirectory}', targetField={targetField}, checkSubdirectories={checkSubdirectories}, fileExtensions={fileExtensions}"
    )

    if not datasetDirectory:
        raise ValueError(
            "Could not generate metadata from directory, 'datasetDirectory' must be specified"
        )

    metafeaturesData = {}

    for root, dirs, files in os.walk(datasetDirectory):
        if not checkSubdirectories:
            # only scan the top-level directory when checkSubdirectories is False
            dirs[:] = []
        for name in files:
            extension = os.path.splitext(name)[1]
            if not name.startswith('.') and (extension in fileExtensions):
                # split twice to handle double extensions, i.e.
                #   'myfile.tsv.gz' => 'myfile'
                dataset = os.path.splitext(os.path.splitext(name)[0])[0]
                datapath = os.path.join(root, name)
                logger.debug(f"Generating metadata for {datapath}")
                metafeatures = mf.generate_metafeatures_from_filepath(
                    datapath, targetField)
                metafeaturesData[dataset] = metafeatures

    return metafeaturesData
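
A minimal usage sketch of the helper above (the 'datasets' directory name is hypothetical; each value in the returned dict is whatever mf.generate_metafeatures_from_filepath produced for that file):

# hypothetical directory containing e.g. iris.csv and adult.tsv
metadata = _generate_metadata_from_directory('datasets',
                                             targetField='class',
                                             checkSubdirectories=False)

for dataset_name, metafeatures in metadata.items():
    # keys are the file names with their extension(s) stripped
    print(dataset_name, metafeatures)
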
Example #2
def _generate_metadata_from_directory(datasetDirectory,
                                      targetField='class',
                                      checkSubdirectories=True):
    """Extract metafeatures for all .csv files in the directory

    :returns dict: dataset name(str):metafeatures(dataFrame)
    """
    metafeaturesData = {}

    for root, dirs, files in os.walk(datasetDirectory):
        if not checkSubdirectories:
            # only scan the top-level directory when checkSubdirectories is False
            dirs[:] = []
        for name in files:
            if not name.startswith('.') and name.endswith('.csv'):
                dataset = os.path.splitext(name)[0]
                logger.debug("Generating metadata for {}".format(
                    os.path.join(root, name)))
                metafeatures = mf.generate_metafeatures_from_filepath(
                    os.path.join(root, name), targetField)
                metafeaturesData[dataset] = metafeatures

    return metafeaturesData
Example #3
def test_generate_metafeatures_from_filepath(self):
    result = mf.generate_metafeatures_from_filepath(
        self.irisPath, self.irisTarget)
    self.assertEqual(set(result.keys()),
                     set(self.expectedMetafeatureKeys))
    self.assertEqual(result[self.depColMetafeature], self.irisTarget)

# Script fragment from the same example: batch-generate metafeatures for each
# PMLB dataset and write them to <savedir>/<dataset>/metafeatures.json.
# Assumes 'args' (an argparse namespace) and 'compression' are defined earlier
# in the script, and that pandas, glob, os, and simplejson are imported.

# load pmlb data
pmlb_data = pd.read_csv(args.pmlb_file, compression='gzip',
                        sep='\t').fillna('')
print('dataset cols:', pmlb_data.columns)
for dataset, dfg in pmlb_data.groupby('dataset'):
    # locate the single data file for this dataset
    if args.flat:
        dataset_path = glob(args.data_dir + '/*' + dataset + '*' +
                            args.tail)
    else:
        dataset_path = glob(args.data_dir + '/' + dataset + '/*' +
                            args.tail)
    print('dataset_path:', dataset_path)
    assert len(dataset_path) == 1
    dataset_path = dataset_path[0]
    print(dataset_path)
    mf = generate_metafeatures_from_filepath(dataset_path,
                                             args.label,
                                             compression=compression)
    meta_json = simplejson.dumps(mf, ignore_nan=True)
    savepath = os.path.join(args.savedir, dataset)
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    out_file = os.path.join(savepath, 'metafeatures.json')
    with open(out_file, 'w') as out:
        out.write(meta_json)
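
The fragment above writes one metafeatures.json per dataset under args.savedir. A minimal sketch for collecting those files back into a single dict, assuming the same <savedir>/<dataset>/metafeatures.json layout ('load_all_metafeatures' and the 'results' directory are hypothetical):

import json
import os


def load_all_metafeatures(savedir):
    """Collect <savedir>/<dataset>/metafeatures.json files into one dict."""
    collected = {}
    for dataset in os.listdir(savedir):
        path = os.path.join(savedir, dataset, 'metafeatures.json')
        if os.path.isfile(path):
            with open(path) as f:
                collected[dataset] = json.load(f)
    return collected


# hypothetical save directory matching args.savedir in the script above
all_metafeatures = load_all_metafeatures('results')
print(len(all_metafeatures), 'datasets with metafeatures')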