Example 1
def train_model(X, y, clf):

    #split the dataset into stratified folds (sklearn.model_selection.StratifiedKFold)
    crossvalidation = StratifiedKFold(n_splits=5).split(X, y)

    #fit the model
    cms = []
    train_scores = []
    test_scores = []


    for train, test in crossvalidation:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        X_train, X_test = impute_nan(X_train, X_test)
        X_train, X_test = normalize_features(X_train, X_test)
        #print(X_train[0])

        clf.fit(X_train, y_train)

        #evaluate the model
        train_score = clf.score(X_train, y_train)
        train_scores.append(train_score)
        test_score = clf.score(X_test, y_test)
        test_scores.append(test_score)

        y_predict = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_predict)
        cms.append(cm)

    return np.mean(test_scores), np.mean(train_scores), np.asarray(cms)
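
The impute_nan and normalize_features helpers are not shown in this snippet. A minimal sketch of what they could look like, assuming scikit-learn's SimpleImputer and StandardScaler and the fit-on-train / transform-both convention implied above:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def impute_nan(X_train, X_test):
    # fit the imputer on the training fold only, then apply it to both folds
    imputer = SimpleImputer(strategy='mean')
    return imputer.fit_transform(X_train), imputer.transform(X_test)

def normalize_features(X_train, X_test):
    # zero-mean / unit-variance scaling, statistics estimated on the training fold only
    scaler = StandardScaler()
    return scaler.fit_transform(X_train), scaler.transform(X_test)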
Example 2
def feature_extraction(audio_file, wl=20, ws=10, nf=24, nceps=19, fmin=0., fmax=4000., d_w=2, pre=0.97, mel=True):
    # Parameters used to extract MFCC (These can be defined in a separate configuration file)
    # wl = 20 # The window length in milliseconds
    # ws = 10 # The window shift in milliseconds
    # nf = 24 # The number of filter bands
    # nceps = 19 # The number of cepstral coefficients
    # fmin = 0. # The minimal frequency of the filter bank
    # fmax = 4000. # The maximal frequency of the filter bank
    # d_w = 2 # The delta value used to compute 1st and 2nd derivatives
    # pre = 0.97 # The coefficient used for the pre-emphasis
    # mel = True # Tell whether MFCC or LFCC are extracted

    # We could also add 1st and 2nd derivatives by activating their flags!

    # read the audio file
    #(rate, signal) = utils.read(audio_file)
    (rate, signal) = utils.load_wav_as_mono(audio_file)

    # extract MFCCs
    ceps = bob.ap.Ceps(rate, wl, ws, nf, nceps, fmin, fmax, d_w, pre, mel)

    # convert the signal to a float array
    signal = signal.astype(numpy.float64)
    mfcc = ceps(signal)

    # let's just normalize them using this helper function
    # This will reduce the effect of the channel
    mfcc = utils.normalize_features(mfcc)

    return mfcc
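
In this example utils.normalize_features takes a single feature matrix rather than a train/test pair. A plausible sketch, assuming it performs per-coefficient mean and variance normalization over the frames of one utterance (cepstral mean/variance normalization, which is what "reduce the effect of the channel" usually refers to):

import numpy

def normalize_features(features, eps=1e-10):
    # features: (num_frames, num_coefficients) array of cepstral coefficients
    mean = numpy.mean(features, axis=0)
    std = numpy.std(features, axis=0)
    # normalize each coefficient across frames; eps guards against division by zero
    return (features - mean) / (std + eps)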
Example 3
def train_model(X, y, c):
    svm_clf = SVC(kernel='linear', C=c)

    crossvalidation = StratifiedKFold(n_splits=5).split(X, y)  # sklearn.model_selection.StratifiedKFold

    #fit the model
    clfs = []
    cms = []
    train_scores = []
    test_scores = []

    for train, test in crossvalidation:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        X_train, X_test = normalize_features(X_train, X_test)

        svm_clf.fit(X_train, y_train)

        train_score = svm_clf.score(X_train, y_train)
        train_scores.append(train_score)

        test_score = svm_clf.score(X_test, y_test)
        test_scores.append(test_score)

        y_predict = svm_clf.predict(X_test)
        cm = confusion_matrix(y_test, y_predict)
        cms.append(cm)

    return np.mean(test_scores), np.mean(train_scores), np.asarray(cms)
Example 4
def train_nolearn_model(X, y):
    '''
        NeuralNet with nolearn
    '''
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 5)
    X_train, X_test = impute_nan(X_train, X_test)
    X_train, X_test = normalize_features(X_train, X_test)

    lays = [('input', layers.InputLayer),
              ('hidden', layers.DenseLayer),
              ('output', layers.DenseLayer),
             ]

    net = NeuralNet(
        layers = lays,
        input_shape=(None, 23),
        hidden_num_units=10,
        objective_loss_function=lasagne.objectives.categorical_crossentropy,
        output_nonlinearity=lasagne.nonlinearities.sigmoid,
        output_num_units=10,


        update = nesterov_momentum,
        update_learning_rate= 0.001,
        update_momentum=0.9,

        max_epochs=10,
        verbose=1,
        )
    net.fit(X_train, y_train)
    test_score = net.score(X_test, y_test)
    train_score = net.score(X_train, y_train)
    return train_score, test_score
Example 5
def train_sknn(X, y):
    '''
        NeuralNet with sknn
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 5)
    X_train, X_test = impute_nan(X_train, X_test)
    X_train, X_test = normalize_features(X_train, X_test)
    nn = Classifier(
        layers=[
            Layer("Tanh", units=12),
            Layer("Softmax")],
        learning_rate=0.005,
        n_iter=25)

    # gs = GridSearchCV(nn, param_grid={
    #     'learning_rate': [0.05, 0.01, 0.005, 0.001],
    #     'hidden0__units': [4, 8, 12,100],
    #     'hidden0__type': ["Rectifier", "Sigmoid", "Tanh"]})
    # gs.fit(X_train, y_train)
    # print(gs.best_estimator_)
    nn.fit(X_train, y_train)
    predicted = nn.predict(X_test).flatten()
    labels = y_test
    return predicted, labels
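
A possible way to use the (predicted, labels) pair returned by train_sknn, assuming X and y are already loaded and using standard scikit-learn metrics:

from sklearn.metrics import accuracy_score, confusion_matrix

predicted, labels = train_sknn(X, y)
print("Accuracy:", accuracy_score(labels, predicted))
print(confusion_matrix(labels, predicted))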
Example 6
def test7():
    print("\n\nTest 7 - K-Means Clustering & PCA")
    print("Expected / Actual:")

    print("\nCentroid assignment:")
    raw = ut.read_mat_raw('mat/ex7data2.mat')
    X = raw['X']
    K = 3
    mu = np.array([[3, 3], [6, 2], [8, 5]])
    idx = km.assign_centroids(X, mu)
    print("0 /", idx[0])
    print("2 /", idx[1])
    print("1 /", idx[2])
    print("1 /", idx[-3])
    print("1 /", idx[-2])
    print("0 /", idx[-1])

    print("\nCentroid adjustment:")
    mu = km.adjust_centroids(X, idx, K)
    print("2.428301 /", mu[0, 0])
    print("3.157924 /", mu[0, 1])
    print("5.813503 /", mu[1, 0])
    print("2.633656 /", mu[1, 1])
    print("7.119387 /", mu[2, 0])
    print("3.616684 /", mu[2, 1])

    print("\nPixel clustering:")
    A = mpl.imread('img/bird_small.png')
    # mpl.imshow(A, extent=[0, 1, 0, 1])
    # mpl.colorbar()
    # mpl.show()
    imgsz = A.shape
    A = A.reshape(imgsz[0] * imgsz[1], imgsz[2])
    K = 16
    iter = 10

    mu, idx = km.clusterize(A, K, iter)
    min = km.compute_cost(A, mu, idx)
    print("Iteration %d cost - %.10f" % (0, min))
    # additional random restarts of k-means (disabled here: range(1, 1) is empty)
    for i in range(1, 1):
        mu_tmp, idx_tmp = km.clusterize(A, K, iter)
        curr = km.compute_cost(A, mu_tmp, idx_tmp)
        print("Iteration %d cost - %.10f" % (i, curr))
        if (curr < min):
            min = curr
            mu = mu_tmp
            idx = idx_tmp

    print("Minimum cost found - %.10f" % min)

    A_new = mu[idx].reshape(imgsz[0], imgsz[1], imgsz[2])
    # mpl.imshow(A_new, extent=[0, 1, 0, 1])
    # mpl.colorbar()
    # mpl.show()

    print("\nPrincipal Component Analysis: ")
    raw = ut.read_mat_raw('mat/ex7data1.mat')
    X = raw['X']
    X = ut.normalize_features(X)[0].T
    U, S = alg.PCA(X)
    print("0.707 /", np.abs(U[0, 0]))
    print("0.707 /", np.abs(U[1, 0]))

    K = 1
    Z = alg.project_data(X, U, K)
    print("1.481 / ", np.abs(Z[0, 0]))

    X_app = alg.recover_data(Z, U, K)
    print("1.047 / ", np.abs(X_app[0, 0]))
    print("1.047 / ", np.abs(X_app[1, 0]))
    # print(X_app.T)
    print("% Variance =", alg.compute_retention(S, K))
Example 7
def test1():
    print("\n\nTest 1 - Linear Regression")
    print("Expected / Actual:")

    print("\nBatch gradient descent: ")
    X, y = ut.read_csv('csv/ex1data1.csv')
    X = ut.create_design(X)
    theta = np.zeros((X.shape[1], ))
    iterations = 1500
    alpha = 0.01
    print("32.0727 / ", alg.SSD(theta, X, y))
    print("52.2425 / ", alg.SSD(np.array([-1, 2]), X, y))
    alg.batch_gd(X, y, theta, alpha, iterations, alg.SSD_gradient)
    print("-3.630291 / ", theta[0])
    print("1.166362 / ", theta[1])
    print("34962.991574 / ",
          ut.predict(np.array([[6.1101]]), theta)[0] * 10**4)
    print("45342.450129 / ", ut.predict(np.array([[7]]), theta)[0] * 10**4)

    print("\nWith optimization: ")
    theta = np.zeros((X.shape[1]), )
    res = opt.minimize(alg.SSD,
                       theta, (X, y),
                       jac=alg.SSD_gradient,
                       method='Newton-CG',
                       options={"maxiter": 1500})
    theta = res.x
    print("-3.630291 / ", theta[0])
    print("1.166362 / ", theta[1])
    print("34962.991574 / ",
          ut.predict(np.array([[6.1101]]), theta)[0] * 10**4)
    print("45342.450129 / ", ut.predict(np.array([[7]]), theta)[0] * 10**4)

    print("\nNormalized batch gradient descent:")
    X, y = ut.read_csv('csv/ex1data2.csv')
    X, mu, sigma = ut.normalize_features(X)
    X = ut.create_design(X)
    alpha = 0.1
    iterations = 400
    theta = np.zeros((X.shape[1], ))
    alg.batch_gd(X, y, theta, alpha, iterations, alg.SSD_gradient)
    print("2000.680851 / ", mu[0])
    print("3.170213 / ", mu[1])
    print("794.7024 / ", sigma[0])
    print("0.7610 / ", sigma[1])
    print("340412.659574 / ", theta[0])
    print("110631.048958 / ", theta[1])
    print("-6649.472950 / ", theta[2])

    print("\nNormal equation:")
    X, y = ut.read_csv('csv/ex1data2.csv')
    X = ut.create_design(X)
    alg.normal_eqn(X, y)  # note: the return value is discarded; the prints below reuse theta from the section above
    print("340412.659574 / ", theta[0])
    print("110631.048958 / ", theta[1])
    print("-6649.472950 / ", theta[2])

    print("\nNormalized prediction:")
    print("293081.464622 / ",
          ut.predict(np.array([[1650, 3]]), theta, mu, sigma)[0])
    print("284343.447245 / ",
          ut.predict(np.array([[1650, 4]]), theta, mu, sigma)[0])

    return
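
alg.normal_eqn is not shown either; the closed-form least-squares solution it presumably computes is the standard normal equation. A sketch, assuming X is a design matrix with a leading column of ones:

import numpy as np

def normal_eqn(X, y):
    # theta = (X^T X)^+ X^T y, using the pseudo-inverse for numerical stability
    return np.linalg.pinv(X.T @ X) @ X.T @ y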
Example 8
X_test_node2 = path+'motifs/'+day+'_thres_'+thres+'_test_'+contacts+'_out_node2_3_0.0001/mat.npy'
X_test_window = path+'motifs/'+day+'_thres_'+thres+'_test_'+contacts+'_out_btw_nodes_3_0.0001/mat.npy'

X_valid_node1 = path+'motifs/'+day+'_thres_'+thres+'_valid_'+contacts+'_out_node1_3_0.0001/mat.npy'
X_valid_node2 = path+'motifs/'+day+'_thres_'+thres+'_valid_'+contacts+'_out_node2_3_0.0001/mat.npy'
X_valid_window = path+'motifs/'+day+'_thres_'+thres+'_valid_'+contacts+'_out_btw_nodes_3_0.0001/mat.npy'

y_train = get_labels(path+day+'_y_train_thres_'+thres+'.npy')
y_test = get_labels(path+day+'_y_test_thres_'+thres+'.npy')
y_valid = get_labels(path+day+'_y_valid_thres_'+thres+'.npy')

X_train=concat_motifs([X_train_node1, X_train_node2])
X_test=concat_motifs([X_test_node1, X_test_node2])
X_valid=concat_motifs([X_valid_node1, X_valid_node2])

X_train_normalized_pairs, X_valid_normalized_pairs, X_test_normalized_pairs = normalize_features(X_train, X_valid, X_test)

X_train_normalized = X_train_normalized_pairs.reshape(X_train_normalized_pairs.shape[0],X_train_normalized_pairs.shape[2]*X_train_normalized_pairs.shape[3])
X_valid_normalized = X_valid_normalized_pairs.reshape(X_valid_normalized_pairs.shape[0],X_valid_normalized_pairs.shape[2]*X_valid_normalized_pairs.shape[3])
X_test_normalized = X_test_normalized_pairs.reshape(X_test_normalized_pairs.shape[0],X_test_normalized_pairs.shape[2]*X_test_normalized_pairs.shape[3])

X_train_valid = np.concatenate((X_train_normalized, X_valid_normalized), axis=0)
y_train_valid = np.concatenate((y_train, y_valid), axis=0)

# Set test_fold to 0 for all samples that are part of the validation set, and to -1 for all other samples.
valid_index = [-1 for i in range(X_train_normalized.shape[0])] + [0 for i in range(X_valid_normalized.shape[0])]

param_grid = {'gamma': [1e-3, 1e-4, 0.005, 0.05, 0.5],'C': [1, 10, 100]}
best_param={}
svm = Genome3D_SVM_RBF(best_param)
best_param = svm.train_cross_val(X_train_valid[:,:], [i for i in y_train_valid[:,0]], valid_index, param_grid)
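
The valid_index list built above follows scikit-learn's PredefinedSplit convention (0 for samples in the validation fold, -1 for samples never used for validation). A minimal sketch of how such a predefined split could drive the grid search, assuming a plain sklearn SVC in place of the project's Genome3D_SVM_RBF wrapper:

from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.svm import SVC

cv = PredefinedSplit(test_fold=valid_index)
grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=cv)
grid.fit(X_train_valid, y_train_valid[:, 0])
print(grid.best_params_)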
Example 9
    cost = np.sum(np.multiply(y, np.log(sigmoid(X.dot(theta)))) + np.multiply((1 - y), np.log(1 - sigmoid(X.dot(theta))))) / -m + \
                l2_lambda / (2 * m) * np.sum(np.power(theta[1:], 2))  # cross-entropy cost + L2 regularization term
    gradient = X.T.dot(sigmoid(X.dot(theta)) -
                       y) / m + l2_lambda / m * np.vstack([0, theta[1:]])
    return cost, gradient


if __name__ == "__main__":
    # Load data from the file
    data = np.matrix(
        np.genfromtxt('logistic_regression/data/data.csv', delimiter=','))
    X = data[:, 0:2]
    y = data[:, 2]
    # Initial setup
    alpha = 5
    X = normalize_features(X)  # normalize features
    X = np.append(np.ones((X.shape[0], 1)), X, axis=1)
    theta = np.zeros((X.shape[1], 1))
    l2_lambda = 0.1
    costs = np.array([])
    iterations = 0
    # Calculate the changing theta
    while True:
        # Re-calculate theta
        cost, gradient = calculate_cost_and_gradient(theta, X, y, l2_lambda)
        theta = theta - alpha * gradient
        print('Cost: %f' % (cost))
        print("Theta: [ %f, %f, %f ]" %
              (theta[0, 0], theta[1, 0], theta[2, 0]))
        #Plot the dependence of cost vs iteration
        plt.clf()
Example 10
from scipy.io import wavfile

audioname = "1brian.wav"
fs, signal = wavfile.read(audioname)
#fs=8000
mt_size = 2.0
mt_step = 0.2
st_win = 0.05

st_step = st_win

#FEATURE EXTRACTION AND NORMALISATION
[mid_term_features,
 short_term_features] = mt_feature_extraction(signal, fs, mt_size * fs,
                                              mt_step * fs, round(fs * st_win))
[mid_term_features_norm, _, _] = normalize_features([mid_term_features.T])
mid_term_features_norm = mid_term_features_norm[0].T
num_of_windows = mid_term_features.shape[1]

# VAD
reserved_time = 1
segment_limits = vad(short_term_features,
                     st_step,
                     smooth_window=0.5,
                     weight=0.3)
i_vad = ivad(segment_limits, mt_step, reserved_time, num_of_windows)
mid_term_features_norm = mid_term_features_norm[:, i_vad]

# remove outliers:
distances_all = numpy.sum(distance.squareform(
    distance.pdist(mid_term_features_norm.T)),
Example 11
import os
from os.path import join

import matplotlib.pyplot as plt
import scipy.io as sio
from sklearn.decomposition import PCA
import numpy as np
from utils import normalize_features

file = "ex7data1.mat"
current_dir = os.path.abspath(".")
data_dir = join(current_dir, 'data')
file_name = join(data_dir, file)
mat_dict = sio.loadmat(file_name)
print("mat_dict.keys() : ", mat_dict.keys())
X = mat_dict["X"]
m = X.shape[0]

# mean removal and feature scaling, as recommended by Andrew Ng
X_normalized = normalize_features(X)

# m = x1.size
sigma = (1 / m) * np.dot(X_normalized.T,
                         X_normalized)  # nxn (n is 2 : number of features)


def plot(_X, title):
    x1 = _X[:, 0]
    x2 = _X[:, 1]
    plt.plot(x1, x2, 'o')
    plt.title(title)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.grid()
    plt.show()
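
The snippet stops after forming the covariance matrix sigma. One conventional way to finish the PCA step, shown here only as a sketch (it is not part of the original file and assumes X_normalized is an (m, 2) array), is to take the SVD of sigma and project onto the leading components, mirroring what Example 6 does through its alg helpers:

# eigen-decomposition of the (symmetric) covariance matrix via SVD
U, S, _ = np.linalg.svd(sigma)

K = 1
Z = np.dot(X_normalized, U[:, :K])      # project onto the first K components
X_recovered = np.dot(Z, U[:, :K].T)     # map back into the original feature space
plot(X_recovered, "Recovered data after PCA (K=1)")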
Example 12
def dayday_feature(data, n_class=2, label_most_common_1=19, flag_unlabel=0):
    t1 = time.time()
    data = data.copy()
    x = data['fea_table'].copy()
    num_nodes = x.shape[0]
    nodes_all = list(x.index)
    df = data['edge_file'].copy()
    max_weight = max(df['edge_weight'])
    df.rename(columns={'edge_weight': 'weight'}, inplace=True)

    degree_in_1st = np.zeros(num_nodes)
    degree_out_1st = np.zeros(num_nodes)
    weight_in_1st = np.zeros(num_nodes)
    weight_out_1st = np.zeros(num_nodes)
    for source, target, weight in df.values:
        source = int(source)
        target = int(target)
        degree_in_1st[target] += 1
        degree_out_1st[source] += 1
        weight_in_1st[target] += weight
        weight_out_1st[source] += weight

    degree_1st_diff = degree_in_1st - degree_out_1st
    weight_1st_diff = weight_in_1st - weight_out_1st

    features_1 = np.concatenate([
        degree_in_1st.reshape(-1, 1),
        degree_out_1st.reshape(-1, 1),
        weight_in_1st.reshape(-1, 1),
        weight_out_1st.reshape(-1, 1),
        degree_1st_diff.reshape(-1, 1),
        weight_1st_diff.reshape(-1, 1)
    ],
                                axis=1)

    features_in_1st = pd.DataFrame({
        "node_index": np.arange(num_nodes),
        "degree_in_1st": degree_in_1st,
        "weight_in_1st": weight_in_1st
    })
    df_degree_in_1st = pd.merge(left=df,
                                right=features_in_1st,
                                left_on="src_idx",
                                right_on="node_index",
                                how="left")
    df_degree_in_1st_info = df_degree_in_1st.groupby(
        'dst_idx')['degree_in_1st'].agg({
            'degree_in_1st_sum': np.sum,
            'degree_in_1st_mean': np.mean,
            'degree_in_1st_min': np.min,
            'degree_in_1st_max': np.max,
            'degree_in_1st_median': np.median
        })
    df_weight_in_1st_info = df_degree_in_1st.groupby(
        'dst_idx')['weight_in_1st'].agg({
            'weight_in_1st_sum': np.sum,
            'weight_in_1st_mean': np.mean,
            'weight_in_1st_min': np.min,
            'weight_in_1st_max': np.max,
            'weight_in_1st_median': np.median
        })

    df_degree_in_2nd = pd.DataFrame({
        "node_index":
        df_degree_in_1st_info.index,
        "degree_in_2nd":
        df_degree_in_1st_info['degree_in_1st_sum']
    })
    df_degree_in_2nd = pd.merge(left=df,
                                right=df_degree_in_2nd,
                                how="left",
                                left_on="src_idx",
                                right_on="node_index")
    df_degree_in_2nd_info = df_degree_in_2nd.groupby(
        'dst_idx')['degree_in_2nd'].agg({
            'degree_in_2nd_sum': np.sum,
            'degree_in_2nd_mean': np.mean,
            'degree_in_2nd_min': np.min,
            'degree_in_2nd_max': np.max,
            'degree_in_2nd_median': np.median
        })
    features_2_index = df_degree_in_1st_info.index
    features_2_t = np.hstack([
        df_degree_in_1st_info.values, df_weight_in_1st_info.values,
        df_degree_in_2nd_info.values
    ])
    features_2 = np.zeros((num_nodes, features_2_t.shape[1]))
    for i, index in enumerate(features_2_index):
        features_2[index] = features_2_t[i]

    train_y = data['train_label'].copy()
    df_info_in = pd.merge(left=df,
                          right=train_y,
                          how='left',
                          left_on='src_idx',
                          right_on='node_index')
    if flag_unlabel == 0:
        df_info_in.dropna(inplace=True)
    else:
        df_info_in.fillna(-1, inplace=True)

    df_labels_in_count = df_info_in.pivot_table(index=["dst_idx"],
                                                columns='label',
                                                aggfunc='size',
                                                fill_value=0)
    df_labels_in_precent = pd.crosstab(index=df_info_in.dst_idx,
                                       columns=df_info_in.label,
                                       normalize='index')

    df_labels_in_without_most_common = df_info_in.copy()
    df_labels_in_without_most_common = df_labels_in_without_most_common[
        df_labels_in_without_most_common.label != label_most_common_1]
    df_labels_in_precent_without_most_common = pd.crosstab(
        index=df_labels_in_without_most_common.dst_idx,
        columns=df_labels_in_without_most_common.label,
        normalize='index')

    df_labels_weight_count_in = df_info_in.pivot_table(index=['dst_idx'],
                                                       columns='label',
                                                       values='weight',
                                                       aggfunc='sum',
                                                       fill_value=0)
    df_labels_weight_percent_in = pd.crosstab(index=df_info_in.dst_idx,
                                              columns=df_info_in.label,
                                              values=df_info_in.weight,
                                              aggfunc='sum',
                                              normalize='index')

    df_labels_weight_percent_in_without_most_common = pd.crosstab(
        index=df_labels_in_without_most_common.dst_idx,
        columns=df_labels_in_without_most_common.label,
        values=df_labels_in_without_most_common.weight,
        aggfunc='sum',
        normalize='index')

    features_3_index = list(df_labels_in_count.index)
    features_3_t = np.hstack(
        (df_labels_in_count.values, df_labels_in_precent.values,
         df_labels_weight_count_in.values, df_labels_weight_percent_in.values))
    features_3 = np.zeros((num_nodes, features_3_t.shape[1]))
    for i, index in enumerate(features_3_index):
        features_3[index] = features_3_t[i]

    labels_in_temp = features_3[:, :n_class]
    labels_weight_in_temp = features_3[:, 2 * n_class:3 * n_class]
    features_labels_all_in_2nd = np.zeros((num_nodes, n_class))
    features_labels_weight_all_in_2nd = np.zeros((num_nodes, n_class))
    for source, target, weight in df.values:
        source = int(source)
        target = int(target)
        features_labels_all_in_2nd[source] += labels_in_temp[target]
        features_labels_weight_all_in_2nd[source] += labels_weight_in_temp[
            target]
    features_labels_all_in_2nd_percent = np.delete(features_labels_all_in_2nd,
                                                   label_most_common_1,
                                                   axis=1)
    features_labels_all_in_2nd_percent = normalize_features(
        features_labels_all_in_2nd_percent)

    features_out_1st = pd.DataFrame({
        "node_index": np.arange(num_nodes),
        "degree_out_1st": degree_out_1st,
        "weight_out_1st": weight_out_1st
    })
    df_degree_out_1st = pd.merge(left=df,
                                 right=features_out_1st,
                                 left_on="dst_idx",
                                 right_on="node_index",
                                 how="left")
    df_degree_out_1st_info = df_degree_out_1st.groupby(
        'src_idx')['degree_out_1st'].agg({
            'degree_out_1st_sum': np.sum,
            'degree_out_1st_mean': np.mean,
            'degree_out_1st_min': np.min,
            'degree_out_1st_max': np.max,
            'degree_out_1st_median': np.median
        })
    df_weight_out_1st_info = df_degree_out_1st.groupby(
        'src_idx')['weight_out_1st'].agg({
            'weight_out_1st_sum': np.sum,
            'weight_out_1st_mean': np.mean,
            'weight_out_1st_min': np.min,
            'weight_out_1st_max': np.max,
            'weight_out_1st_median': np.median
        })

    df_degree_out_2nd = pd.DataFrame({
        "node_index":
        df_degree_out_1st_info.index,
        "degree_out_2nd":
        df_degree_out_1st_info['degree_out_1st_sum']
    })
    df_degree_out_2nd = pd.merge(left=df,
                                 right=df_degree_out_2nd,
                                 how="left",
                                 left_on="dst_idx",
                                 right_on="node_index")
    df_degree_out_2nd_info = df_degree_out_2nd.groupby(
        'src_idx')['degree_out_2nd'].agg({
            'degree_out_2nd_sum': np.sum,
            'degree_out_2nd_mean': np.mean,
            'degree_out_2nd_min': np.min,
            'degree_out_2nd_max': np.max,
            'degree_out_2nd_median': np.median
        })
    features_4_index = df_degree_out_1st_info.index
    features_4_t = np.hstack([
        df_degree_out_1st_info.values, df_weight_out_1st_info.values,
        df_degree_out_2nd_info.values
    ])
    features_4 = np.zeros((num_nodes, features_4_t.shape[1]))
    for i, index in enumerate(features_4_index):
        features_4[index] = features_4_t[i]

    df_info_out = pd.merge(left=df,
                           right=train_y,
                           how='left',
                           left_on='dst_idx',
                           right_on='node_index')
    if flag_unlabel == 0:
        df_info_out.dropna(inplace=True)
    else:
        df_info_out.fillna(-1, inplace=True)

    df_labels_out_count = df_info_out.pivot_table(index=["src_idx"],
                                                  columns='label',
                                                  aggfunc='size',
                                                  fill_value=0)
    df_labels_out_precent = pd.crosstab(index=df_info_out.src_idx,
                                        columns=df_info_out.label,
                                        normalize='index')

    df_labels_out_without_most_common = df_info_out.copy()
    df_labels_out_without_most_common = df_labels_out_without_most_common[
        df_labels_out_without_most_common.label != label_most_common_1]
    df_labels_out_precent_without_most_common = pd.crosstab(
        index=df_labels_out_without_most_common.src_idx,
        columns=df_labels_out_without_most_common.label,
        normalize='index')

    df_labels_weight_count_out = df_info_out.pivot_table(index=['src_idx'],
                                                         columns='label',
                                                         values='weight',
                                                         aggfunc='sum',
                                                         fill_value=0)
    df_labels_weight_percent_out = pd.crosstab(index=df_info_out.src_idx,
                                               columns=df_info_out.label,
                                               values=df_info_out.weight,
                                               aggfunc='sum',
                                               normalize='index')
    df_labels_weight_percent_out_without_most_common = pd.crosstab(
        index=df_labels_out_without_most_common.src_idx,
        columns=df_labels_out_without_most_common.label,
        values=df_labels_out_without_most_common.weight,
        aggfunc='sum',
        normalize='index')

    features_5_index = list(df_labels_out_count.index)
    features_5_t = np.hstack(
        (df_labels_out_count.values, df_labels_out_precent.values,
         df_labels_weight_count_out.values,
         df_labels_weight_percent_out.values))
    features_5 = np.zeros((num_nodes, features_5_t.shape[1]))
    for i, index in enumerate(features_5_index):
        features_5[index] = features_5_t[i]

    features_merge = np.concatenate([
        features_1, features_2, features_3, features_4, features_5,
        features_labels_all_in_2nd, features_labels_all_in_2nd_percent
    ],
                                    axis=1)
    features_merge = np.unique(features_merge, axis=1)
    features_merge = np.delete(
        features_merge,
        np.argwhere(np.sum(features_merge, axis=0) == 0),
        axis=1)

    return features_merge
Example 13
        print("Params: lr={:.4f}, epochs={}, weight_decay={:.5f}, patience={}, hidden_size={}, num_layers={}, package={}, dataset={}"\
    .format(args.lr, args.epochs, args.weight_decay, args.patience, args.hidden_size, args.num_layers, args.package, args.dataset))

    if args.dataset == "cora":
        data = load_data("data/cora/cora.pkl")

    adj = data['adj']
    features = data['features']
    y_train = data['y_train']
    y_val = data['y_val']
    y_test = data['y_test']
    train_mask = data['train_index']
    val_mask = data['val_index']
    test_mask = data['test_index']
    adj = normalize_adj(adj)
    features = normalize_features(features)

    if args.package == "numpy":
        features = features.toarray()
        adj = adj.toarray()
    elif args.package == "ctf":
        y_train = ctf.astensor(y_train)
        y_val = ctf.astensor(y_val)
        y_test = ctf.astensor(y_test)

        features = features.toarray()
        adj = adj.toarray()
        adj = ctf.astensor(adj)
        features = ctf.astensor(features)

        adj = ctf.tensor(sp=True, copy=adj)
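
In this last example normalize_features operates on a scipy sparse feature matrix (the result still supports .toarray() above). A common GCN-style implementation, given here as an assumed sketch rather than the project's actual helper, row-normalizes each node's feature vector:

import numpy as np
import scipy.sparse as sp

def normalize_features(features):
    # row-normalize a sparse feature matrix so that each node's features sum to 1
    rowsum = np.asarray(features.sum(axis=1)).flatten()
    with np.errstate(divide='ignore'):
        r_inv = 1.0 / rowsum
    r_inv[np.isinf(r_inv)] = 0.0        # rows that sum to zero are left as all zeros
    return sp.diags(r_inv).dot(features)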