import numpy as np
from keras.preprocessing.sequence import pad_sequences  # tensorflow.keras.preprocessing.sequence also works
from sklearn.preprocessing import OneHotEncoder


def sentence2Index(dataPath, vocabDict, maxLen=100, lowercase=True):
    """
    :param dataPath: path of the data file
    :param vocabDict: vocabulary dict {word: index}
    :param maxLen: max length of a sentence; longer sentences are truncated
    :param lowercase: boolean, lowercase the words or not
    :return: s1Pad: padded sentence1
             s2Pad: padded sentence2
             s1Mask: actual length of sentence1
             s2Mask: actual length of sentence2
             labelList: one-hot encoded labels
    """
    s1List, s2List, labelList = [], [], []
    s1Mask, s2Mask = [], []
    with open(dataPath, mode='r', encoding='utf-8') as f:
        for line in f:
            try:
                l, s1, s2 = [v.strip() for v in line.strip().split('||')]
                if lowercase:
                    s1, s2 = s1.lower(), s2.lower()
                s1 = [v.strip() for v in s1.split()]
                s2 = [v.strip() for v in s2.split()]
                if len(s1) > maxLen:
                    s1 = s1[:maxLen]
                if len(s2) > maxLen:
                    s2 = s2[:maxLen]
                if l in CATEGORIE_ID:
                    labelList.append([CATEGORIE_ID[l]])
                    # map each word to its index, falling back to the UNKNOWN token
                    s1List.append([
                        vocabDict[word] if word in vocabDict else vocabDict[UNKNOWN]
                        for word in s1
                    ])
                    s2List.append([
                        vocabDict[word] if word in vocabDict else vocabDict[UNKNOWN]
                        for word in s2
                    ])
                    s1Mask.append(len(s1))
                    s2Mask.append(len(s2))
            except Exception:
                raise ValueError('Input Data Value Error!')
    s1Pad = pad_sequences(s1List, maxLen, padding='post')
    s2Pad = pad_sequences(s2List, maxLen, padding='post')
    # one-hot encode the label ids (scikit-learn >= 1.2 spells the dense flag sparse_output=False)
    enc = OneHotEncoder(sparse=False)
    labelList = enc.fit_transform(labelList)
    s1Mask = np.asarray(s1Mask, np.int32)
    s2Mask = np.asarray(s2Mask, np.int32)
    labelList = np.asarray(labelList, np.int32)
    return s1Pad, s1Mask, s2Pad, s2Mask, labelList
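# A minimal usage sketch for sentence2Index. CATEGORIE_ID, UNKNOWN, the vocab,
# and the toy file below are illustrative assumptions; the function expects the
# first two to exist at module level and the data file to hold one
# "label || sentence1 || sentence2" triple per line.
CATEGORIE_ID = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
UNKNOWN = '<unk>'
vocabDict = {'<pad>': 0, UNKNOWN: 1, 'a': 2, 'man': 3, 'is': 4, 'running': 5}

with open('toy_pairs.txt', 'w', encoding='utf-8') as f:
    f.write('entailment || A man is running || A man is moving\n')
    f.write('neutral || A man is running || A man is outside\n')

s1Pad, s1Mask, s2Pad, s2Mask, labels = sentence2Index('toy_pairs.txt', vocabDict, maxLen=10)
print(s1Pad.shape, s1Mask)  # (2, 10) [4 4]; 'moving'/'outside' map to UNKNOWN
print(labels)               # one one-hot row per kept line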
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder


def get_standard_data(left_data, right_data, y_data, max_length):
    left_arr = pad_sequences(left_data, maxlen=max_length, dtype='int32',
                             padding='post', truncating='post', value=0)
    right_arr = pad_sequences(right_data, maxlen=max_length, dtype='int32',
                              padding='post', truncating='post', value=0)
    # one-hot encode the binary labels; n_values=2 was removed from
    # scikit-learn in 0.22, categories=[[0, 1]] is the equivalent
    enc = OneHotEncoder(categories=[[0, 1]], dtype='float32')
    y_arr = enc.fit_transform(y_data).toarray()  # one-hot vectors
    # y_arr = np.array(y_data, dtype='int32')
    return left_arr, right_arr, y_arr
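# Toy call to get_standard_data; the index lists and labels are made-up
# illustration data. y_data has to be a 2-D column of 0/1 labels so that
# OneHotEncoder can fit it.
left = [[2, 3, 4], [5, 6]]
right = [[3, 4], [2, 5, 6, 7]]
y = [[0], [1]]
left_arr, right_arr, y_arr = get_standard_data(left, right, y, max_length=5)
print(left_arr)  # each row padded/truncated to length 5
print(y_arr)     # [[1. 0.] [0. 1.]]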
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# df is assumed to be an existing DataFrame with one or more object-dtype columns
print(df)

# Hand-rolled dummy-variable conversion
df_new = df.copy()
for col_num, col_name in enumerate(df):
    col_data = df[col_name]
    col_type = col_data.dtype
    if col_type == 'object':
        # drop the string column and replace it with one boolean
        # indicator column per unique value
        df_new = df_new.drop(col_name, axis=1)
        value_sets = col_data.unique()
        for value_unique in value_sets:
            col_name_new = col_name + '-' + value_unique
            col_tmp = df.iloc[:, col_num]
            new_col = (col_tmp == value_unique)
            df_new[col_name_new] = new_col
print(df_new)

# Dummy-variable (one-hot) encoding with sklearn
df2 = pd.DataFrame({
    'id': [3566841, 6541227, 3512441],
    'sex': [1, 2, 2],
    'level': [3, 1, 2]
})
id_data = df2.values[:, :1]
print(id_data)
transform_data = df2.values[:, 1:]
enc = OneHotEncoder()
df2_new = enc.fit_transform(transform_data).toarray()
df2_all = pd.concat((pd.DataFrame(id_data), pd.DataFrame(df2_new)), axis=1)
print(df2_all)
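# For comparison, pandas can do the manual expansion above in a single call;
# this assumes the same df with object-dtype columns as before.
df_dummies = pd.get_dummies(df, prefix_sep='-')
print(df_dummies)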
import numpy as np

# dataset and X are assumed to be loaded earlier, e.g. X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values
# print(X)

# Handling the missing data (Imputer was replaced by SimpleImputer in
# scikit-learn 0.22; per-column imputation, the old axis=0, is the default)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Creating a dummy variable (categorical_features was removed from
# OneHotEncoder; ColumnTransformer now selects the columns to encode)
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# Splitting the dataset into training sets and test sets
# (sklearn.cross_validation was renamed sklearn.model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Feature scaling: fit the scaler on the training set only, then reuse it on
# the test set; refitting on X_test would scale the two splits inconsistently
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
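# The same preprocessing can also be expressed as one ColumnTransformer so the
# imputer and scaler are always fit on the training split alone; the column
# indices (0 categorical, 1-2 numeric) and the X_train_raw/X_test_raw names are
# assumptions for illustration, not part of the snippet above.
from sklearn.pipeline import Pipeline

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), [0]),
    ('num', Pipeline([('impute', SimpleImputer(strategy='mean')),
                      ('scale', StandardScaler())]), [1, 2]),
])
X_train_p = preprocess.fit_transform(X_train_raw)  # fit on training data only
X_test_p = preprocess.transform(X_test_raw)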