# Register every value observed in this label under its feature name.  The
# stored 1 is only a presence marker; it is overwritten with an index below.
# NOTE(review): `label` is bound outside this fragment -- presumably by a loop
# over label lines; confirm the intended nesting of these three statements.
dataarray = getArrayFromPattern(label)
for x, value in enumerate(dataarray):
    featdict[featnamearray[x]][value] = 1

# Give each value of every type-1 feature a consecutive index in sorted order,
# with the special value 'xx' (when present) always placed last.  The index
# counts of those features are summed into input_dim_v.
input_dim_v = 0
for featname, featvaluedict in featdict.items():
    if feattypedict[featname] != 1:
        continue  # features of any other type are left untouched
    ordered = sorted(key for key in featvaluedict if key != 'xx')
    for index, key in enumerate(ordered):
        featvaluedict[key] = index
    size = len(ordered)
    if 'xx' in featvaluedict:
        featvaluedict['xx'] = size
        size += 1
    input_dim_v += size

# A parenthesised single argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print("input vector dimension: %d" % input_dim_v)
# pprint.pprint(featdict)

# Save the dictionary information.  `with` guarantees the file is closed even
# when dump() raises.
with open('featdict.pkl', 'wb') as output:
    sPickle.dump((featnamearray, feattypedict, featdict), output)
# Convert every field after the first of the current label line to a float32
# vector, standardise it with X_mean_N / X_std_N and append it to the
# sequence being collected.
# NOTE(review): `lines`, `fidx`, `Y_tmp` and the X_tmp_* lists are bound
# outside this fragment -- presumably inside per-line / per-file loops whose
# headers are not visible here; confirm the nesting before relying on this
# layout.
tmp_N = np.array(lines[1:], dtype='float32')
tmp_N = (tmp_N - X_mean_N) / X_std_N
X_tmp_N.append(tmp_N)

if REMOVE_SIL is True:
    # idxarray is applied as an index array on the first axis of Y_tmp
    # (presumably a boolean keep-mask -- verify where it is filled).
    Y_tmp = Y_tmp[np.array(idxarray)]
assert (len(Y_tmp) == len(X_tmp_V))

# Files numbered up to nTrain go to the training split, the rest to test.
if fidx <= nTrain:
    X_train_N.append(X_tmp_N)
    X_train_V.append(X_tmp_V)
    Y_train.append(list(Y_tmp))
else:
    X_test_N.append(X_tmp_N)
    X_test_V.append(X_tmp_V)
    Y_test.append(list(Y_tmp))

print(len(X_train_N), len(X_train_V), len(X_test_N), len(X_test_V), len(Y_train), len(Y_test))

# The six objects are written as consecutive pickles, so a reader has to call
# load() six times in this same order.  `with` closes the file even when one
# of the dumps raises.
with open('sequence.pkl', 'wb') as output:
    for payload in (X_train_N, X_train_V, X_test_N, X_test_V, Y_train, Y_test):
        sPickle.dump(payload, output)

print("end")
# Replace the placeholder stored for each feature value by a consecutive index
# in sorted order; the special value 'xx' (when present) gets the last index.
for featvaluedict in featdict.values():
    ordered = sorted(key for key in featvaluedict if key != 'xx')
    for index, key in enumerate(ordered):
        featvaluedict[key] = index
    if 'xx' in featvaluedict:
        featvaluedict['xx'] = len(ordered)
#pprint.pprint(featdict)

# `with` closes the file even when dump() raises.
with open('featdict.pkl', 'wb') as output:
    sPickle.dump(featdict, output)

# One-hot encode each row: every type-1 feature contributes a vector as long
# as its value dictionary, with a single 1 at the index of the observed value.
# Features of any other type contribute nothing here.
featurearrays = []
for dataarray in dataarrays:
    featurearray = []
    for x, value in enumerate(dataarray):
        featname = featnamearray[x]
        if feattypedict[featname] != 1:
            continue
        codes = featdict[featname]
        vector = [0] * len(codes)
        vector[codes[value]] = 1
        # extend() grows the row in place; `row = row + vector` re-copied the
        # whole row for every feature.
        featurearray.extend(vector)
    featurearrays.append(featurearray)

# NOTE(review): `numberfeatures` is filled outside this fragment.
numberfeatures = numpy.array(numberfeatures, dtype=numpy.float32)
# Collects inputs/targets and writes their per-column statistics to
# 'Normalize.pkl'.
#
# Visible steps, in order:
#   * Y_tmp is read from `cmpfile` as float32 and reshaped to rows of
#     OUTPUT_DIM values.
#   * `labfile` is read line by line and split on whitespace; reading stops at
#     the first line with fewer than 3 fields.  A line whose first field
#     contains "-sil+" while REMOVE_SIL is True appends False to idxarray;
#     any other line appends True.
#   * idxarray must end up with one entry per row of Y_tmp (assert).
#   * mean/std over axis 0 of X_train and Y_train are pickled as one tuple
#     (X_mean, X_std, Y_mean, Y_std).
#
# NOTE(review): indentation was lost in this fragment.  `X_tmp.append(lines[1:])`
#   presumably belongs to the `else` branch (only kept lines are stored) --
#   confirm against the original script.
# NOTE(review): the statement `Y_tmp = (` is truncated and cannot be parsed;
#   the expression that followed it is missing from this view.
# NOTE(review): `X_train += X_tmp` followed by `X_train.mean(axis=0)` implies
#   a list-to-array conversion that is not visible here -- TODO confirm.
# NOTE(review): `output` is opened and closed by hand; if this code is
#   restored, a `with open(...)` block would close it on errors as well.
X_tmp = [] Y_tmp = np.fromfile(cmpfile, dtype='float32') Y_tmp = Y_tmp.reshape(-1, OUTPUT_DIM) with open(labfile) as fp: for line in fp.readlines(): lines = line.strip().split() if(len(lines) < 3): break if (lines[0].find("-sil+")!=-1 and REMOVE_SIL is True): idxarray.append(False) pass else: idxarray.append(True) X_tmp.append(lines[1:]) assert(len(idxarray) == len(Y_tmp)) Y_tmp = ( Y_train += Y_tmp X_train += X_tmp X_mean = X_train.mean(axis=0) X_std = X_train.std(axis=0) Y_mean = Y_train.mean(axis=0) Y_std = Y_train.std(axis=0) output = open('Normalize.pkl', 'wb') sPickle.dump((X_mean, X_std, Y_mean, Y_std), output) output.close() print "end"
# NOTE(review): these two statements extend the test split with the current
# file's data; the branch/loop that encloses them is outside this fragment --
# confirm their nesting.
X_test_V += X_tmp_V
Y_test += list(Y_tmp)

# Convert the accumulated Python lists to arrays.
X_train_N = np.array(X_train_N, dtype=np.float32)
X_test_N = np.array(X_test_N, dtype=np.float32)
X_train_V = np.array(X_train_V)
X_test_V = np.array(X_test_V)
Y_train = np.array(Y_train, dtype='float32')
Y_test = np.array(Y_test, dtype='float32')

# Statistics are taken over the first axis of the training split only and are
# reused for the test split below.
X_mean_N = X_train_N.mean(axis=0)
X_std_N = X_train_N.std(axis=0)
Y_mean = Y_train.mean(axis=0)
Y_std = Y_train.std(axis=0)

# `with` closes the file; the original passed an anonymous open() handle to
# dump() and never closed it.
with open("normalize.pkl", "wb") as stats_file:
    sPickle.dump((X_mean_N, X_std_N, Y_mean, Y_std), stats_file)

# NOTE(review): a column with zero standard deviation yields inf/NaN here.
X_train_N = (X_train_N - X_mean_N) / X_std_N
X_test_N = (X_test_N - X_mean_N) / X_std_N
Y_train = (Y_train - Y_mean) / Y_std
Y_test = (Y_test - Y_mean) / Y_std

# NOTE(review): the two shuffles stay aligned only if the generator was also
# seeded with SEED right before the first shuffle (not visible here).
np.random.shuffle(X_train_V)
np.random.seed(SEED)  # reset seeds
np.random.shuffle(X_train_N)

print(X_train_N.shape, X_train_V.shape, X_test_N.shape, X_test_V.shape, Y_train.shape, Y_test.shape)

# The handle is intentionally left open: the statements that follow this
# fragment are not visible and may keep writing to `output`.
output = open('data.pkl', 'wb')
sPickle.dump(X_train_N, output)