def split_dataset(seqs_info, split_options):
    """Split the sequences in ``seqs_info`` for learning and testing.

    The strategy is selected by ``split_options["method"]``:

    - ``"wsample"``: group the sequences by length, draw a weighted sample
      from the groups using ``split_options["trainset_size"]``, then
      aggregate the sample into the final split.
    - ``"cross_validation"`` / ``"random"``: pass the list of sequence ids
      and the whole ``split_options`` dict to ``split_data``.
    - ``"none"``: no splitting; every sequence id is placed in one
      training set.

    Args:
        seqs_info: mapping keyed by sequence id.
        split_options: dict holding ``"method"`` plus any options the chosen
            method reads (``"trainset_size"`` for ``"wsample"``).

    Returns:
        For ``"none"``: ``{0: {"train": [all sequence ids]}}``. For the other
        methods: whatever the delegated helper returns (presumably the same
        ``{fold: {"train": ..., "test": ...}}`` layout -- confirm against
        ``split_data`` / ``aggregate_weightedsample``).

    Raises:
        KeyError: if ``"method"`` (or ``"trainset_size"`` for ``"wsample"``)
            is missing from ``split_options``.
        ValueError: if ``split_options["method"]`` is not a supported method.
    """
    method = split_options["method"]

    if method == "wsample":
        # 1) bucket the sequences by length
        grouped_seqs = group_seqs_by_length(seqs_info)
        # 2) weighted sample over the length buckets
        w_sample = weighted_sample(
            grouped_seqs, trainset_size=split_options["trainset_size"]
        )
        # 3) merge the sampled buckets into training / testing categories
        return aggregate_weightedsample(w_sample)

    if method in ("cross_validation", "random"):
        # both methods delegate to split_data, which receives the full
        # options dict (including "method")
        return split_data(list(seqs_info.keys()), split_options)

    if method == "none":
        return {0: {"train": list(seqs_info.keys())}}

    # Previously an unknown method fell through every branch and the final
    # return failed with UnboundLocalError; report the real problem instead.
    raise ValueError(
        "unknown split method {!r}; expected one of 'wsample', "
        "'cross_validation', 'random', 'none'".format(method)
    )
def split_dataset(self, seqs_info, split_options):
    """Split the sequences in ``seqs_info`` into training/testing groups.

    ``split_options['method']`` picks the strategy:

    - ``"wsample"``: sequences are grouped by length, a weighted sample is
      drawn with ``split_options['trainset_size']``, and the sample is
      aggregated into the resulting split.
    - ``"cross_validation"`` / ``"random"``: the sequence ids and the full
      ``split_options`` dict are forwarded to ``split_data``.
    - ``"none"``: all sequence ids go into a single training set.

    Args:
        seqs_info: mapping keyed by sequence id.
        split_options: dict with a ``'method'`` entry and any options the
            selected method reads (``'trainset_size'`` for ``"wsample"``).

    Returns:
        ``{0: {'train': [all sequence ids]}}`` for ``"none"``; otherwise the
        value produced by the delegated helper (layout not visible here --
        verify against ``split_data`` / ``aggregate_weightedsample``).

    Raises:
        KeyError: if a required key is absent from ``split_options``.
        ValueError: if ``split_options['method']`` is not a supported method.
    """
    chosen = split_options['method']

    if chosen == "wsample":
        # group by length -> weighted sample -> aggregate into train/test
        by_length = group_seqs_by_length(seqs_info)
        sampled = weighted_sample(by_length,
                                  trainset_size=split_options['trainset_size'])
        data_split = aggregate_weightedsample(sampled)
    elif chosen == "cross_validation" or chosen == 'random':
        # both strategies are implemented by split_data
        data_split = split_data(list(seqs_info.keys()), split_options)
    elif chosen == 'none':
        data_split = {0: {'train': list(seqs_info.keys())}}
    else:
        # An unsupported method used to leave data_split unbound and crash
        # on return with UnboundLocalError; fail with a clear message.
        raise ValueError(
            "unknown split method {!r}; expected one of 'wsample', "
            "'cross_validation', 'random', 'none'".format(chosen)
        )
    return data_split