Beispiel #1
0
 def split_dataset(seqs_info, split_options):
     """Split the sequences in ``seqs_info`` into training/testing partitions.

     Args:
         seqs_info: mapping keyed by sequence id. For the ``"wsample"`` method
             the whole mapping is handed to ``group_seqs_by_length``; for every
             other method only its keys (the sequence ids) are used.
         split_options: dictionary of options. ``split_options["method"]``
             selects the strategy:

             - ``"wsample"``: group the sequences by length, draw a weighted
               sample using ``split_options["trainset_size"]``, then aggregate
               the sample into the final split.
             - ``"cross_validation"`` / ``"random"``: delegate to
               ``split_data``, passing the sequence ids and ``split_options``.
             - ``"none"``: no splitting; every sequence id goes into the
               training set of a single fold.

     Returns:
         The split produced by the selected strategy. For ``"none"`` this is
         ``{0: {"train": [<all sequence ids>]}}``.

     Raises:
         KeyError: if ``split_options`` has no ``"method"`` entry (or no
             ``"trainset_size"`` entry when the method is ``"wsample"``).
         ValueError: if ``split_options["method"]`` is not a supported method.
     """
     method = split_options["method"]

     if method == "wsample":
         # Group by length first so the sample is weighted by sequence length.
         grouped_seqs = group_seqs_by_length(seqs_info)
         w_sample = weighted_sample(
             grouped_seqs, trainset_size=split_options["trainset_size"]
         )
         # Collapse the per-group sample into training/testing categories.
         return aggregate_weightedsample(w_sample)

     if method in ("cross_validation", "random"):
         # Both strategies are implemented by split_data, which reads the
         # details it needs from split_options itself.
         return split_data(list(seqs_info.keys()), split_options)

     if method == "none":
         return {0: {"train": list(seqs_info.keys())}}

     # Fail loudly instead of falling through with no result bound.
     raise ValueError(
         "unknown split method {!r}; expected one of 'wsample', "
         "'cross_validation', 'random' or 'none'".format(method)
     )
Beispiel #2
0
 def split_dataset(self, seqs_info, split_options):
     """Split the sequences in ``seqs_info`` into training/testing partitions.

     Args:
         seqs_info: mapping keyed by sequence id. For the ``'wsample'`` method
             the whole mapping is handed to ``group_seqs_by_length``; for every
             other method only its keys (the sequence ids) are used.
         split_options: dictionary of options. ``split_options['method']``
             selects the strategy:

             - ``'wsample'``: group the sequences by length, draw a weighted
               sample using ``split_options['trainset_size']``, then aggregate
               the sample into the final split.
             - ``'cross_validation'`` / ``'random'``: delegate to
               ``split_data``, passing the sequence ids and ``split_options``.
             - ``'none'``: no splitting; every sequence id goes into the
               training set of a single fold.

     Returns:
         The split produced by the selected strategy. For ``'none'`` this is
         ``{0: {'train': [<all sequence ids>]}}``.

     Raises:
         KeyError: if ``split_options`` has no ``'method'`` entry (or no
             ``'trainset_size'`` entry when the method is ``'wsample'``).
         ValueError: if ``split_options['method']`` is not a supported method.
     """
     method = split_options['method']

     if method == "wsample":
         # Group by length first so the sample is weighted by sequence length.
         grouped_seqs = group_seqs_by_length(seqs_info)
         w_sample = weighted_sample(
             grouped_seqs, trainset_size=split_options['trainset_size']
         )
         # Collapse the per-group sample into training/testing categories.
         return aggregate_weightedsample(w_sample)

     if method in ("cross_validation", "random"):
         # Both strategies are implemented by split_data, which reads the
         # details it needs from split_options itself.
         return split_data(list(seqs_info.keys()), split_options)

     if method == "none":
         return {0: {'train': list(seqs_info.keys())}}

     # Fail loudly instead of falling through with no result bound.
     raise ValueError(
         "unknown split method {!r}; expected one of 'wsample', "
         "'cross_validation', 'random' or 'none'".format(method)
     )