def sample_data(X, y):
    """Prepare a label vector and resample the dataset per module-level config.

    For the age task, continuous targets are first bucketed into ordinal
    classes via ``bins``; then under-/over-sampling is applied according to
    ``sample_type``. When no resampling is configured, non-age labels are
    returned as a column vector.
    """
    if sourceType == SourceType.age:
        # Discretize continuous ages into ordinal bucket indices.
        buckets = pd.cut(y, bins, labels=range(len(bins) - 1))
        y = LabelEncoder().fit_transform(buckets)

    if sample_type == SampleType.under:
        return under_sample(X, y)
    if sample_type == SampleType.over:
        return over_sample(X, y)

    # No resampling requested: age labels are already a flat int array,
    # every other task keeps y as an (n, 1) column vector.
    if sourceType != SourceType.age:
        y = y.reshape(-1, 1)
    return X, y
def split_train_test(dataSet):
    """Split a raw data matrix into features and labels, then resample.

    Column layout depends on ``sourceType``:
      * race  — label is the last column (cut = -1);
      * age   — features stop at -2, the raw age is read from the final
        column (cut + 1 == -1);
      * other — features stop at -2 and the label sits at column -2.

    Returns the (X, y) pair produced by ``sample_data`` (binning for age,
    plus any configured under-/over-sampling).
    """
    cut = -1 if sourceType == SourceType.race else -2
    if sourceType == SourceType.age:
        X, y = dataSet[:, :cut], dataSet[:, cut + 1]
    else:
        X, y = dataSet[:, :cut], dataSet[:, cut]
    y = y.astype('int')
    # The binning/resampling tail was a byte-for-byte duplicate of
    # sample_data(); delegate to keep the two code paths consistent.
    return sample_data(X, y)
remainHandles = get_remain_handles( "/home/yaguang/new_nonstop_onefeaturesword1.csv") onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] fix_seq_len = 200 batch_size = 32 rocs = [] #emoji processing word2idx = read_word2idx() pretrained_embedding = load_pretrained_vectors(word2idx) index_to_file_label = map_index_to_file_label(onlyfiles) index, labels = get_index_label(index_to_file_label) usecols = list(np.arange(1, 769)) counter = [0, 0, 0, 0] index, labels = under_sample(index.reshape(-1, 1), labels) index = index.ravel() for label in labels: counter[label] += 1 w = open("metric_result.txt", 'w') skf = StratifiedKFold(n_splits=10, shuffle=True) learning_rate = 0.0001 counter = [0, 0, 0, 0] for idx in index: counter[index_to_file_label[idx][1]] += 1 print(counter) for train_index, test_index in skf.split(index, labels):