Example 1
def convert(raw_dir, max_features):
    dataset_dict = util.convert_uci_classif(info, raw_dir, file_name,
                                            y_first=True)

    # Feature 15 is constant and therefore useless.
    # Feature 4 is the most important feature (according to a random forest),
    # but with it the problem is too easy: most decent learning algorithms
    # classify the test set perfectly.
    return util.remove_features(dataset_dict, [4, 15])
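For reference, the remove_features helper used above is only ever asked to drop column indices from the converted dataset. A minimal sketch of what such a helper might look like, assuming the dataset dict stores the feature matrix under 'x' as a 2-D NumPy array (an illustration, not the project's actual implementation):

import numpy as np

def remove_features(dataset_dict, feature_indices):
    # Hypothetical sketch: drop the given column indices from the
    # feature matrix; assumes dataset_dict['x'] is a 2-D numpy array.
    dataset_dict['x'] = np.delete(dataset_dict['x'], feature_indices, axis=1)
    return dataset_dict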
Example 2
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    return util.convert_uci_classif(info,
                                    raw_dir,
                                    file_name_list,
                                    delimiter=' ')
Example 3
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    return util.convert_uci_classif(info,
                                    raw_dir,
                                    file_name,
                                    converters={0: util.convert_date})
Example 4
def convert(raw_dir, max_features):
    util.untar(raw_dir, file_name)

    file_name_list = ["census-income.data", "census-income.data"]

    # We remove feature 24 (instance weight).
    columns = list(range(24)) + list(range(25, 42))
    return util.convert_uci_classif(info, raw_dir, file_name_list,
                                    delimiter=", ", usecols=columns)
Example 5
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    
    info['x'], info['y'] = util.convert_uci_classif(
        info['x_type'], info['y_type'], raw_dir, 'anneal.data')

    return info
Example 6
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    data_dir = os.path.join(raw_dir, 'ml-prove')
    file_name_list = ['test.csv', 'validation.csv', 'train.csv']
    dataset_dict = util.convert_uci_classif(info, data_dir, file_name_list)

    return util.remove_features(dataset_dict, [51, 53])
Example 7
def convert(raw_dir, max_features):
    util.unzip(raw_dir, file_name)
    data_dir = os.path.join(raw_dir, "PAMAP2_Dataset", "Protocol")

    file_name_list = ["subject10%d.dat" % x for x in range(1, 10)]

    # We remove features 0 (timestamp), 16-19, 33-36, and 50-53 (invalid).
    columns = list(range(1, 16)) + list(range(20, 33)) + list(range(37, 50))
    return util.convert_uci_classif(info, data_dir, file_name_list, stride=4,
                                    delimiter=" ", y_first=True,
                                    usecols=columns)
Example 8
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    columns = list(range(1, 32))
    return util.convert_uci_classif(info,
                                    raw_dir,
                                    file_name,
                                    y_first=True,
                                    usecols=columns)
Example 9
def convert(raw_dir, max_features):
    
    # Keep only one line out of ten: the full dataset fails to load in
    # memory (using numpy.loadtxt with str data type).
    file_name_ = "kddcup_sub.data"
    with gzip.open(path.join(raw_dir, file_name), 'rt') as fd_read:
        with open(path.join(raw_dir, file_name_), 'w') as fd_write:
            for i, line in enumerate(fd_read):
                if i % 10 == 0:
                    fd_write.write(line)

    return util.convert_uci_classif(info, raw_dir, file_name_)
Example 10
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """

    # In the data file, one instance is split across two lines.
    # The following code writes a fixed file.
    fixed_file_name = file_name + '2'
    with open(path.join(raw_dir, file_name)) as f_in:
        with open(path.join(raw_dir, fixed_file_name), 'w') as f_out:
            for line in f_in:
                line = line.strip()
                f_out.write(line)
                # A line that does not end with a comma closes the instance.
                if len(line) > 0 and line[-1] != ',':
                    f_out.write('\n')

    return util.convert_uci_classif(info, raw_dir, fixed_file_name)
Example 11
def convert(raw_dir, max_features):
    return util.convert_uci_classif(info, raw_dir, file_name)
Example 12
def convert(raw_dir, max_features):
    """
    returns a dictionary containing the required fields for the dataset.
    """
    return util.convert_uci_classif(info, raw_dir, file_name)
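Throughout these examples, info and file_name are free variables, so each dataset module presumably defines them at module level, and the surrounding loader calls convert with the directory holding the raw download. A minimal usage sketch under that assumption (the path and the 'x'/'y' keys are illustrative; cf. Example 5, which assigns info['x'] and info['y']):

# Hypothetical caller, for illustration only.
dataset = convert('/tmp/raw/some-dataset', max_features=None)
x, y = dataset['x'], dataset['y']  # assumed keys of the returned dict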