Example #1
import numpy as np
from itertools import chain

from sklearn.utils import indexable, safe_indexing  # removed in scikit-learn >= 0.24

# `train_test_split` used below is assumed to be the extended Galaxy-ML variant
# (e.g. galaxy_ml.model_validations.train_test_split), which accepts the string
# shuffle modes and the `labels` keyword used here, not sklearn's own function.


def train_test_split_none(*arrays, **kwargs):
    """Extend train_test_split to take None arrays
    and support splitting by group names.
    """
    nones = []
    new_arrays = []
    for idx, arr in enumerate(arrays):
        if arr is None:
            nones.append(idx)
        else:
            new_arrays.append(arr)

    # Galaxy passes tool parameters as strings, so the literal "None"
    # has to be mapped back to the real None.
    if kwargs.get("shuffle") == "None":
        kwargs["shuffle"] = None

    # Optional comma-separated list of group names that should form the test set.
    group_names = kwargs.pop("group_names", None)

    if group_names is not None and group_names.strip():
        # Deterministic split by group: samples whose group label is listed in
        # `group_names` become the test set, all remaining samples the train set.
        group_names = [name.strip() for name in group_names.split(",")]
        new_arrays = indexable(*new_arrays)
        groups = kwargs["labels"]  # per-sample group labels
        n_samples = new_arrays[0].shape[0]
        index_arr = np.arange(n_samples)
        test = index_arr[np.isin(groups, group_names)]
        train = index_arr[~np.isin(groups, group_names)]
        rval = list(
            chain.from_iterable(
                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
            )
        )
    else:
        # Otherwise delegate to train_test_split (see the import note above);
        # kwargs may still carry the `labels` array it needs.
        rval = train_test_split(*new_arrays, **kwargs)

    # Re-insert a (train, test) pair of Nones at the position of every input
    # array that was None.
    for pos in nones:
        rval[pos * 2: pos * 2] = [None, None]

    return rval
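A minimal usage sketch (the toy data and the "hold-out" group name below are made up, and the function plus its scikit-learn helpers are assumed to be importable): every row whose group label is "hold-out" ends up in the test split, and a None placeholder array is passed through as a (None, None) pair.

import numpy as np

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array(["a", "a", "b", "b", "hold-out", "hold-out"])

# Rows whose group label appears in `group_names` form the test set; the
# trailing None stands in for an optional array (e.g. sample weights).
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split_none(
    X, y, None,
    labels=groups,
    group_names="hold-out",
    shuffle="None",
)
# X_train has 4 rows, X_test has 2; sw_train and sw_test are both None.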
Example #2
import json
import warnings

import pandas as pd

# `train_test_split` and `_get_single_cv_split` are assumed to be provided by
# the surrounding Galaxy tool script: the former is the extended split helper
# shown in Example #1, the latter builds a single cross-validation split.


def main(inputs,
         infile_array,
         outfile_train,
         outfile_test,
         infile_labels=None,
         infile_groups=None):
    """
    Parameters
    ----------
    inputs : str
        File path to the Galaxy tool parameter JSON file

    infile_array : str
        File path to the tabular input array

    infile_labels : str
        File path to dataset containing labels

    infile_groups : str
        File path to dataset containing groups

    outfile_train : str
        File path to dataset containing train split

    outfile_test : str
        File path to dataset containing test split
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    # 'header0' flags whether the input table has a header row.
    input_header = params['header0']
    header = 'infer' if input_header else None
    array = pd.read_csv(infile_array,
                        sep='\t',
                        header=header,
                        parse_dates=True)

    # train test split
    if params['mode_selection']['selected_mode'] == 'train_test_split':
        options = params['mode_selection']['options']
        # The shuffle strategy (and the label column, if any) live in the
        # nested 'shuffle_selection' block; flatten it into plain kwargs.
        shuffle_selection = options.pop('shuffle_selection')
        options['shuffle'] = shuffle_selection['shuffle']
        if infile_labels:
            header = 'infer' if shuffle_selection['header1'] else None
            # Galaxy column indices are 1-based; convert to 0-based.
            col_index = shuffle_selection['col'][0] - 1
            df = pd.read_csv(infile_labels,
                             sep='\t',
                             header=header,
                             parse_dates=True)
            labels = df.iloc[:, col_index].values
            options['labels'] = labels

        train, test = train_test_split(array, **options)

    # cv splitter
    else:
        train, test = _get_single_cv_split(params,
                                           array,
                                           infile_labels=infile_labels,
                                           infile_groups=infile_groups)

    print("Input shape: %s" % repr(array.shape))
    print("Train shape: %s" % repr(train.shape))
    print("Test shape: %s" % repr(test.shape))
    train.to_csv(outfile_train, sep='\t', header=input_header, index=False)
    test.to_csv(outfile_test, sep='\t', header=input_header, index=False)
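For reference, a sketch of the parameter JSON that main() reads for the train_test_split mode; the field names are the ones the code accesses above, while the concrete values (test_size, shuffle strategy, column index) are illustrative assumptions, since the real file is generated by the Galaxy tool wrapper.

import json

params = {
    "header0": True,                          # input table has a header row
    "mode_selection": {
        "selected_mode": "train_test_split",
        "options": {
            "test_size": 0.25,                # assumed split option
            "random_state": 42,               # assumed split option
            "shuffle_selection": {
                "shuffle": "simple",          # shuffle strategy name (assumed value)
                "header1": False,             # labels file has no header row
                "col": [1],                   # 1-based column holding the labels
            },
        },
    },
}

with open("params.json", "w") as fh:
    json.dump(params, fh)

# Hypothetical file names, for illustration only:
main("params.json", "data.tsv", "train.tsv", "test.tsv",
     infile_labels="labels.tsv")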