Example #1
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

# CATEGORIE_ID (label -> class id dict) and UNKNOWN (key of the
# unknown-word token) are assumed to be defined at module level.


def sentence2Index(dataPath, vocabDict, maxLen=100, lowercase=True):
    """
    :param dataPath: path of data file
    :param vocabDict: vocabulary dict {word : index}
    :param maxLen: max length of sentence, if a sentence longer than maxLen, cut off it
    :param lowercase: boolean, lower words or not
    :return: s1Pad: padded sentence1
             s2Pad: padded sentence2
             s1Mask: actual length of sentence1
             s2Mask: actual length of sentence2
    """
    s1List, s2List, labelList = [], [], []
    s1Mask, s2Mask = [], []
    with open(dataPath, mode='r', encoding='utf-8') as f:
        for line in f:
            try:
                l, s1, s2 = [v.strip() for v in line.strip().split('||')]
                if lowercase:
                    s1, s2 = s1.lower(), s2.lower()
                s1 = [v.strip() for v in s1.split()]
                s2 = [v.strip() for v in s2.split()]
                if len(s1) > maxLen:
                    s1 = s1[:maxLen]
                if len(s2) > maxLen:
                    s2 = s2[:maxLen]
                if l in CATEGORIE_ID:
                    labelList.append([CATEGORIE_ID[l]])
                    s1List.append([
                        vocabDict[word]
                        if word in vocabDict else vocabDict[UNKNOWN]
                        for word in s1
                    ])
                    s2List.append([
                        vocabDict[word]
                        if word in vocabDict else vocabDict[UNKNOWN]
                        for word in s2
                    ])
                    s1Mask.append(len(s1))
                    s2Mask.append(len(s2))
            except ValueError:
                # Malformed lines (wrong number of '||'-separated fields)
                # raise ValueError from the unpacking above; re-raise it
                # instead of constructing an exception and discarding it.
                raise ValueError('Input Data Value Error!')

    s1Pad = pad_sequences(s1List, maxLen, padding='post')
    s2Pad = pad_sequences(s2List, maxLen, padding='post')
    enc = OneHotEncoder(sparse=False)  # return a dense array directly
    labelList = enc.fit_transform(labelList)
    s1Mask = np.asarray(s1Mask, np.int32)
    s2Mask = np.asarray(s2Mask, np.int32)
    labelList = np.asarray(labelList, np.int32)
    return s1Pad, s1Mask, s2Pad, s2Mask, labelList
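A minimal usage sketch for sentence2Index; the file path, vocabulary, and constants below are hypothetical, and the data file is expected to contain one label||sentence1||sentence2 example per line.

# Hypothetical setup -- the constants and vocabulary are illustrative only.
UNKNOWN = '<unk>'
CATEGORIE_ID = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
vocabDict = {'<pad>': 0, UNKNOWN: 1, 'the': 2, 'cat': 3, 'sleeps': 4}

s1Pad, s1Mask, s2Pad, s2Mask, labels = sentence2Index(
    'train.txt', vocabDict, maxLen=50)   # 'train.txt' is a placeholder path
print(s1Pad.shape, labels.shape)  # (numExamples, 50), (numExamples, numClasses)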
Example #2
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder


def get_standard_data(left_data, right_data, y_data, max_length):
    left_arr = pad_sequences(left_data,
                             maxlen=max_length,
                             dtype='int32',
                             padding='post',
                             truncating='post',
                             value=0)
    right_arr = pad_sequences(right_data,
                              maxlen=max_length,
                              dtype='int32',
                              padding='post',
                              truncating='post',
                              value=0)
    # n_values is the pre-0.20 scikit-learn API for a fixed category count
    enc = OneHotEncoder(n_values=2, dtype='float32')
    y_arr = enc.fit_transform(y_data).toarray()  # one-hot label vectors
    # y_arr = np.array(y_data, dtype='int32')
    return left_arr, right_arr, y_arr
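A toy call for the function above; the index sequences and binary labels are made up, and y_data is shaped as a column vector because the encoder one-hot-encodes each feature column.

# Hypothetical, already-indexed token sequences.
left_data = [[3, 7, 1], [2, 9]]
right_data = [[4, 1], [8, 5, 6, 2]]
y_data = [[0], [1]]  # binary labels as a column vector

left_arr, right_arr, y_arr = get_standard_data(left_data, right_data,
                                               y_data, max_length=5)
print(left_arr.shape, y_arr.shape)  # (2, 5) (2, 2)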
Example #3
import pandas as pd

# df is assumed to be an existing DataFrame that mixes object (categorical)
# and numeric columns.
print(df)

# Custom (manual) conversion to dummy variables
df_new = df.copy()
for col_num, col_name in enumerate(df):
    col_data = df[col_name]
    col_type = col_data.dtype
    if col_type == 'object':
        df_new = df_new.drop(col_name, axis=1)
        value_sets = col_data.unique()
        for value_unique in value_sets:
            col_name_new = col_name + '-' + value_unique
            col_tmp = df.iloc[:, col_num]
            new_col = (col_tmp == value_unique)
            df_new[col_name_new] = new_col
print(df_new)

# Dummy-variable (one-hot) encoding with sklearn

df2 = pd.DataFrame({
    'id': [3566841, 6541227, 3512441],
    'sex': [1, 2, 2],
    'level': [3, 1, 2]
})
id_data = df2.values[:, :1]
print(id_data)
transform_data = df2.values[:, 1:]
enc = OneHotEncoder()
df2_new = enc.fit_transform(transform_data).toarray()
df2_all = pd.concat((pd.DataFrame(id_data), pd.DataFrame(df2_new)), axis=1)
print(df2_all)
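As an aside (not part of the original snippet), pandas can reproduce the manual loop above in one call; a minimal equivalent, assuming the same df as above:

# get_dummies expands object/categorical columns into 0/1 indicator columns
# and leaves numeric columns untouched; prefix_sep='-' matches the
# col_name + '-' + value naming used in the loop above.
df_new = pd.get_dummies(df, prefix_sep='-')
print(df_new)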
Example #4
# dataset and X (its first three columns) are assumed to be loaded earlier,
# e.g. via pandas.read_csv; column 3 holds the labels.
Y = dataset.iloc[:, 3].values
# print(X)

# Handling the missing data
# Imputer is the pre-0.20 scikit-learn API (later replaced by SimpleImputer)
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Creating a dummy variable
# categorical_features is the pre-0.20 scikit-learn API
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# Splitting the dataset into training and test sets
# sklearn.cross_validation became sklearn.model_selection in 0.18
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # reuse the training-set statistics
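Example #4 targets pre-0.20 scikit-learn. A sketch of the same preprocessing against the current API, assuming the same dataset, X, Y, and column layout as above (not the original code):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# SimpleImputer replaces Imputer; it imputes column-wise by default.
X[:, 1:3] = SimpleImputer(missing_values=np.nan,
                          strategy="mean").fit_transform(X[:, 1:3])

# ColumnTransformer replaces categorical_features: one-hot encode
# column 0 and pass the remaining columns through unchanged.
ct = ColumnTransformer([("onehot", OneHotEncoder(), [0])],
                       remainder="passthrough")
X = ct.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # scale with the training-set statistics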