Example #1
import sys


def main():
    response_columns = []
    columns = []
    filename = sys.argv[1]
    #columns = sys.argv[2].strip('[]').split(',')
    response_columns_classification = sys.argv[2].strip('[]').split(', ')
    response_columns_prediction = sys.argv[3].strip('[]').split(', ')

    #features
    columns = [
        'MO', 'N', 'INDICE20', 'PHEAU', 'PHSMP', 'KECH', 'CAECH', 'MGECH',
        'NAECH', 'HECH', 'CEC', 'PM3', 'MNM3', 'CUM3', 'FEM3', 'ALM3', 'BM3',
        'ZNM3', 'PBM3', 'MOM3', 'CDM3', 'COM3', 'CRM3', 'KM3', 'CAM3', 'MGM3',
        'NAM3'
    ]
    # keep N, INDICE20, PHEAU, PHSMP, HECH, and the *M3 columns
    columns_M3 = columns[1:5] + [columns[9]] + columns[11:]
    columns = columns_M3

    #single target classification
    for response_target in response_columns_classification:
        df = read_data(filename, columns, response_target)
        data_view(df, columns, response_target)
        classification(df, columns, response_target)
    #regression
    df = read_data(filename, columns, response_columns_prediction)
    df.dropna(inplace=True)
    prediction(df, columns, response_columns_prediction)
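
# A minimal sketch of the bracketed-list argument format that main() above
# expects, e.g.  python script.py data.csv "[PHEAU, KECH]" "[CEC, PM3]"
# (the script name and this exact invocation are hypothetical):
arg = "[PHEAU, KECH]"                  # stands in for sys.argv[2]
targets = arg.strip('[]').split(', ')  # -> ['PHEAU', 'KECH']
assert targets == ['PHEAU', 'KECH']
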
def evaluate_accuracy(task_id, difficulty, errors=False, outputFile='result.txt', silent=False):
    if not silent:
        print('reading and processing testing data')
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    if not silent:
        print("Getting representation")
    # representation = _classifier.get_representation(task_id)
    # if representation is None:
    #        representation = _classifier.default_representation(Y)
    Y = _classifier.labels_remove_twos(Y)
    representation = _classifier.find_representation(Y)

    print('Representation for accuracy: ', representation)
    if not silent:
        print('read')
        print('predicting..')

    raw_predicted = nn.get_model(tasks_encoded[(task_id, difficulty)]).predict(X)
    
    # _classifier.print_labels(raw_predicted[:5])
    predicted = _classifier.get_normal_output(raw_predicted, representation)
    # print(predicted.shape)
    acc = accuracy(predicted, Y, raw_predicted, errors)
    print('Accuracy %.4f' % acc)
    return acc
def check():

    #raw_labels = np.array([[0.0, 0.142, 0.542, 0.001, 0.0, 0.13, 0.124, 0.0, 0.0, 0.061000001]])
    #representation = [(8,1)]
    #Y, _ = transform_labels_with_representation(labels_remove_twos(Y), 4)
    _, Y = read_data(task_id=3, difficulty=2)
    Y = labels_remove_twos(Y)
    rep = find_representation(Y)
    print(rep)
Example #4
def read_data_sheet(self):
    datafile = frame_1.datafile.GetValue()
    if not os.path.exists(datafile):
        self.popup_box("Can't find "+datafile, "Can't find "+datafile)
        return
    #print all_genes_and_traits
    #data_sheet.update(readdata.read_data(datafile))
    data_list, column_labels = readdata.read_data(datafile)
    self.data_sheet.extend(data_list)
    #column_labels = data_sheet.keys()
    self.all_genes_and_traits.extend(column_labels)
    #print "all", all_genes_and_traits
    #all_genes_and_traits.sort()
    frame_1.gene_list.Set(self.all_genes_and_traits)
    frame_1.trait_list.Set(self.all_genes_and_traits)
    # assume that the selection variable is among the first 20 columns
    frame_1.selection_variable_list.SetItems(self.all_genes_and_traits[0:20])
Example #5
import sys

import joblib


def main():
    # uncomment this to create new model
    prepare_training_set()
    # return
    # load model back
    clf = joblib.load('model.pkl')

    # read sample
    image = read_data(sys.stdin)

    # extract the characters
    characters = extract_characters(image)

    for character in characters:
        prediction = clf.predict(character.ravel().reshape(1, -1))
        sys.stdout.write(prediction[0])

    sys.stdout.write('\n')
def get_original_output(task_id, difficulty):
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    Y = _classifier.labels_remove_twos(Y)
    representation = _classifier.find_representation(Y)
    raw_predicted = nn.get_model(tasks_encoded[(task_id, difficulty)]).predict(X)
    predicted = _classifier.get_normal_output(raw_predicted, representation)

    def transform_single_label(label, how):
        i = 0
        new = []
        for id in range(len(how)):
            if how[id]==2:
                new.append(0)
            else:
                new.append(label[i])
                i += 1
        return new
    return list(map(lambda label: transform_single_label(label, Y[0]), predicted))
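
# A self-contained toy run of the label expansion above (with the fixed
# `return new`): zeros are re-inserted wherever the reference label holds a 2.
def expand_label(label, how):
    # stand-alone copy of transform_single_label's logic, for illustration
    i, new = 0, []
    for h in how:
        if h == 2:
            new.append(0)
        else:
            new.append(label[i])
            i += 1
    return new

# positions 1 and 3 were dropped as twos, so [5, 7] expands to [5, 0, 7, 0]
assert expand_label([5, 7], [0, 2, 1, 2]) == [5, 0, 7, 0]
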
def model_statistics_on_task(task_id, difficulty, epochs):
    # train one epoch at a time, recording accuracy after each epoch
    X_, Y = read_data(task_id=task_id, difficulty=difficulty)
    X_, Y, _ = transform_data(X_, Y, task_id)
    nn.construct_model(X_[0].shape)
    nn.add_new_task(len(Y[0]))

    model = nn.get_model().model
    X = nn.get_model().make_input(X_)

    tasks_encoded[(task_id, difficulty)] = len(nn.tasks) - 1
    acc_history = []
    for epoch in range(epochs + 1):
        print('Epoch #', epoch)
        if epoch > 0:
            model.fit(X, Y, nb_epoch=1)
        acc_history.append(evaluate_accuracy(task_id, difficulty, silent=True))
    return acc_history
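
# Hypothetical usage of the history returned above: plot accuracy against
# training epochs (matplotlib assumed; the numbers are toy values, not real
# measurements).
import matplotlib.pyplot as plt

acc_history = [0.52, 0.66, 0.71, 0.74]  # e.g. model_statistics_on_task(3, 2, 3)
plt.plot(range(len(acc_history)), acc_history, marker='o')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()
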
Example #8
def train_network_ui(task_id, difficulty, epochs=3):
    # tasks[(task_id, difficulty)] = len(tasks.keys())
    X, Y = read_data(task_id=task_id, difficulty=difficulty)
    train_network(X, Y, epochs=epochs, train=True, load_model=False, filename=['model.txt', 'weights.hdf5'],
                  task_id=task_id)
    tasks_encoded[(task_id, difficulty)] = len(nn.tasks) - 1
    old_accuracy = evaluate_accuracy(task_id, difficulty, silent=True)
    print('Old accuracy: ', old_accuracy)
    if len(nn.tasks)>1:
        buffered = nn.tasks[-1]
        nn.tasks.pop()
        train_network(X, Y, epochs=epochs, train=True, load_model=False, filename=['model.txt', 'weights.hdf5'],
                      task_id=task_id, independent=True)
        new_accuracy = evaluate_accuracy(task_id, difficulty, silent=True)
        print('New accuracy: ', new_accuracy)
        if new_accuracy<old_accuracy:
            nn.tasks[-1] = buffered
        old_accuracy = new_accuracy
    if old_accuracy<0.9:
        nn.tasks[-1].kill()
def evaluate_accuracy(task_id, difficulty, errors=False, outputFile='result.txt'):
    print('reading and processing testing data')
    X, Y = read_data(training=False, task_id=task_id, difficulty=difficulty)
    print("Getting representation")

    print('read')
    Y = _classifier.labels_remove_twos(Y)
    print('predicting..')
    representation = _classifier.find_representation(Y)

    if (task_id, difficulty) in tasks.keys():
        model = nn.get_model(tasks[(task_id, difficulty)])
    else:
        raise Exception('Task unknown')

    raw_predicted = model.predict(X, verbose=1)
    
    predicted = _classifier.get_normal_output(raw_predicted, representation)
    acc = accuracy(predicted, Y, raw_predicted, errors)
    print('Accuracy %.4f' % acc)
    return acc
Example #11
from matplotlib.pyplot import *
from numpy             import *
from Green_function    import Green_func
from readdata          import read_data


(rs,r0,R2,kx,kt,Rm2,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data()

#kx = [0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#kx = [0.0,0.5,1.5,2.5,5.0,10.0]
kx = 0.0
kt = [1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0]
Scale = 1
Gr_f = zeros(len(kt),complex)
for i in range(0,len(kt)):
   Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
   print(Gr_f[i])
print(Scale)
figure(1)
plot(kt,Gr_f.imag,label='kx = %f'%(kx))
######################################################################################
kx = 5.0
for i in range(0,len(kt)):
   Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
   print(Gr_f[i])
print(Scale)
plot(kt,Gr_f.imag,label='kx = %f'%(kx))
######################################################################################
kx = 10.0
for i in range(0,len(kt)):
   Gr_f[i] = Green_func(rs,r0,R2,kx,kt[i],Rm2,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
def train_network_ui(task_id, difficulty, epochs=3):
    tasks[(task_id, difficulty)] = len(tasks.keys())
    X, Y = read_data(task_id=task_id, difficulty=difficulty)
    return train_network(X, Y, epochs=epochs, train=True, load_model=False, filename=['model.txt', 'weights.hdf5'])
Example #13
import numpy as np
import sklearn.linear_model as linear_model

from LogisticRegression import LogisticRegression
from readdata import read_data
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

X_train,y_train = read_data("./Data/train")
X_train = X_train.toarray()
#convert labels to 0,1
idx = np.where(y_train==-1)
y_train[idx] = 0.
kf = cross_validation.KFold(y_train.shape[0], n_folds=10, indices=False)
X_test,y_test = read_data("./Data/test")
X_test = X_test.toarray()
idx = np.where(y_test==-1)
y_test[idx] = 0.
parameters = dict({
    "clf__weight_decay"      : [.001,.01,.1,1,10,100,1000],
    "clf__init_beta"         : [0.,0.00001,.0001,0.001,.01,.1,1.,10],
                  })

lr = LogisticRegression(learning="LBFGS",X_test=X_test,y_test=y_test)
pipeline = Pipeline([
                        ("clf"  ,lr),
                    ])
f = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=-1,cv=3)
f.fit(X_train,y_train)
best_parameters = f.best_estimator_.get_params()
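
# The snippet above uses the pre-0.18 sklearn modules (cross_validation,
# grid_search). A self-contained sketch of the same pipeline + grid-search
# pattern with the modern model_selection API, on toy data and sklearn's own
# LogisticRegression instead of the custom class:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                                  # toy features
y = (X[:, 0] + 0.5 * rng.randn(100) > 0).astype(int)   # toy binary labels

pipeline = Pipeline([("clf", LogisticRegression())])
parameters = {"clf__C": [0.01, 0.1, 1.0, 10.0]}
search = GridSearchCV(pipeline, parameters, cv=3)
search.fit(X, y)
print(search.best_params_)
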
        # Don't need to remember the old paths
        path = newpath
    n = 0           # if only one element is observed, the max is taken over the initialization values
    if len(obs)!=1:
        n = t
    (prob, state) = max((V[n][y], y) for y in states)
    return (prob, path[state])
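
# The fragment above is the tail of a Viterbi decoder. A minimal
# self-contained sketch of the full dynamic program, using the same
# dictionary-based conventions (the HMM here is a toy with made-up numbers):
def viterbi_demo(obs, states, start_p, trans_p, emit_p):
    # V[t][y]: probability of the best state path ending in y after obs[:t+1]
    V = [{y: start_p[y] * emit_p[(y, obs[0])] for y in states}]
    path = {y: [y] for y in states}
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            prob, prev = max((V[t - 1][y0] * trans_p[(y0, y)] *
                              emit_p[(y, obs[t])], y0) for y0 in states)
            V[t][y] = prob
            newpath[y] = path[prev] + [y]
        path = newpath  # don't need to remember the old paths
    prob, state = max((V[len(obs) - 1][y], y) for y in states)
    return (prob, path[state])

demo_states = ('A', 'B')
demo_start = {'A': 0.6, 'B': 0.4}
demo_trans = {('A', 'A'): 0.7, ('A', 'B'): 0.3,
              ('B', 'A'): 0.4, ('B', 'B'): 0.6}
demo_emit = {('A', 'x'): 0.9, ('A', 'y'): 0.1,
             ('B', 'x'): 0.2, ('B', 'y'): 0.8}
print(viterbi_demo(['x', 'y', 'y'], demo_states, demo_start,
                   demo_trans, demo_emit))  # -> (0.062208, ['A', 'B', 'B'])
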

if __name__ == '__main__':
    try:
        datafile = sys.argv[1]
    except:
        datafile = 'AllDataWithNonHarmonics.csv'

    headers = ['module', 'root', 'bar_of_phrase', 'letter', 'bars_per_phrase', 'song_name']
    data = read_data(datafile, headers)
    transition_probs, emission_probs, initial_probs, states = get_probabilities(data)

    # print states
    # for one, two in emission_probs:
    #     print '{}->{}: {:.4f}'.format(one, two, emission_probs[(one, two)])

    # for state in initial_probs:
    #     print state, initial_probs[state]

    total_correct = 0
    total = 0
    for song in data:
        obs = [entry['root'] for entry in song]
        correct = [entry['module'].split('_')[0] for entry in song]
        prob, predictions = viterbi(obs, states, initial_probs, transition_probs, emission_probs)
        return chord_counts, transition_counts

    def get_transition_probs(chord_counts, transition_counts):
        """
        Returns a dictionary of transition probabilities based on counts for chords
        and transitions.

        """
        probs = dict(transition_counts) # make a copy so we don't destroy the counts dictionary
        for (first, second), count in transition_counts.items():
            probability = transition_counts[(first, second)] / chord_counts[first]
            probs[(first, second)] = probability
        return probs
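
    # Toy check of the computation above (hypothetical chords): each
    # transition count is divided by the count of its first chord.
    demo_chords = {'I': 4.0, 'V': 2.0}
    demo_transitions = {('I', 'V'): 2, ('V', 'I'): 1, ('I', 'I'): 2}
    demo_probs = get_transition_probs(demo_chords, demo_transitions)
    assert demo_probs[('I', 'V')] == 0.5 and demo_probs[('V', 'I')] == 0.5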

    if __name__ == '__main__':
        try:
            datafile = sys.argv[1]
        except:
            datafile = 'AlldataWithNonHarmonics.csv'

        data = read_data(datafile)
        chord_counts, transition_counts = get_overall_counts(data)
        transition_probs = get_transition_probs(chord_counts, transition_counts)

        # map roman numerals to integers for sorting, and convert back for display
        transitions = [(RN.index(c1), RN.index(c2)) for c1, c2 in transition_probs]
        print('\n' + 'Phrase Length = ' + z[j])

        for c1, c2 in sorted(transitions):
            print('({} -> {}): {:.4f}'.format(RN[c1], RN[c2], transition_probs[(RN[c1], RN[c2])]))
#init_state =  np.tile(1e-5 * np.random.uniform(low=0.0,high=1.0,size=(n,n)),[batch_size,a_num,1,1])
loss_p = 0

batch_num_idx = range(batch_num)
k_fold = KFold(n_splits=10)
final_acc_fold = np.zeros((10, 1))
#CL = Chol_de(current_X,n)
#CC = Chol_com(CL,n,eps)

data = []
label = []

for idx in range(batch_num):
    print(idx)
    data_batch_in, label_batch_in = read_data(
        idx, '../../TTRNN/UCF11_updated_mpg/processed_data/', matrix_length,
        sample_rate)
    data.append(data_batch_in)
    label.append(label_batch_in)

with tf.Session() as sess:
    final_acc = 0.
    co = 0
    for tr_indices, ts_indices in k_fold.split(batch_num_idx):
        sess.run(tf.global_variables_initializer())
        print(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ]))
        #start_time = time.time()
import csv
import sys


def read_data(filename):
    with open(filename, 'r') as csvf:
        return [row for row in csv.reader(csvf)]


def write_csv(data, filename):
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for line in data:
            writer.writerow(line)


if __name__ == '__main__':
    try:
        datafile = sys.argv[1]
    except:
        datafile = 'new_cluster_chord_by_chord.csv'

    inputdata = read_data(datafile)
    outputdata = []
    uniquesongs = []
    header = ['song', 'artist', 'year', 'meter', 'cluster15','cluster6'] #add kmeans6 when ready
    outputdata.append(header)
    for line in inputdata:
        if line[0] not in uniquesongs:
            uniquesongs.append(line[0])
            songmeta = [line[0], line[1], line[2], line[3], line[8], line[9]] # add line[9] when kmeans6 is ready
            outputdata.append(songmeta)

    write_csv(outputdata, 'metadata_with_clusters.csv')
def evaluateDataset():
    """
    Returns
    -------
    clf : Classifier Model
        Evaluate and choose a classifier with the best suited options.
        Evaluation includes accuracy, F1 score, precision and recall. 
        The output lists the importance of the selected features. 

    """
    ### Add a couple features to this list.
    # from newDataPoint import createNewPoints
    # createNewPoints(data_dict)

    alldata = read_data('flightdelays-2010-2020.csv')

    key_features_list = [
        'arr_del15', 'carrier_ct', ' weather_ct', 'nas_ct', 'security_ct',
        'late_aircraft_ct'
    ]
    features_list = [
        'arr_del15', ' weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct'
    ]
    data = pd.DataFrame(alldata, columns=key_features_list)
    target = []

    data = data.dropna()

    ### Create a prediction target for late flights because of carrier delays.
    for e in data['carrier_ct']:
        if (e > 0):
            target.append(1)
        else:
            target.append(0)

    ### Remove carrier_ct from the list as we used that to create the
    ### target data for the predictions.
    ### carrier_ct = weather_ct + nas_ct + security_ct + late_aircraft_ct
    newdata = pd.DataFrame(data, columns=features_list)

    data = newdata

    ### Decision Tree
    from time import time
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    t0 = time()
    clf.fit(data, target)
    print("training time for all data:", round(time() - t0, 3), "s")

    ### print accuracy
    print("Decision Tree Accuracy on All the data: ",
          round(clf.score(data, target), 3))

    from classifyDT import classifyDT

    from sklearn.model_selection import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(data, target, test_size=0.5, random_state=42)

    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    # features_train, features_test, labels_train, labels_test = preprocess()

    clf = classifyDT(features_train, labels_train, features_test, labels_test)

    ### Determine the importance of the features that we chose.
    lat = [i for i in clf.feature_importances_]

    ### The commented-out version used a 0.2 cutoff to pick important
    ### features; the active version keeps any feature with nonzero importance.
    #def condition(x): return x > 0.2
    def condition(x):
        return x

    output = [idx for idx, element in enumerate(lat) if condition(element)]
    print("output:", output)
    for i in output:
        print("importance: of ", features_list[i], " is ", round(lat[i], 3))

    return clf
Example #19
from matplotlib.pyplot import *
from numpy             import *
from Green_function    import Green_func
from readdata          import read_data

(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,muq) = read_data()

#kx = [0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#kx = [0.0,0.5,1.5,2.5,5.0,10.0]
kx = 3.0
kt = linspace(-2.0,2.0,81)
Scale = 1
Gr_f = zeros(len(kt),complex)
for i in range(0,len(kt)):
   Gr_f[i] = Green_func(rs,r0,L,kx,kt[i],ms,d,h3,h2,h1,rmax,Q,M,muq,Scale)
   print(Gr_f[i])
print(Scale)
figure(1)
plot(kt,Gr_f.imag,label='Scale = %f'%Scale)
#######################################################################################
#Scale = 10
#Gr_f = zeros(len(kt),complex)
#for i in range(0,len(kt)):
#   Gr_f[i] = Green_func(rs,r0,L,kx,kt[i],ms,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
#   print Gr_f[i]
#print Scale
#figure(1)
#plot(kt,Gr_f.imag,label='Scale = %f'%Scale)

Example #20
def train(use_data,semi_sv,output,data_aug,epoch=1000):
    def get_subset(dataset,idx):
        data = {}
        for key,value in dataset.items():
            data[key] = value[idx]
        return data

    def concat_data(data1,data2):
        result = {}
        for k in data1.keys():
            result[k] = np.concatenate([data1[k],data2[k]])
        return result

    from readdata import read_data


    tr,te, embedding_matrix, labels = read_data(use_data,data_aug=data_aug)

    print(use_data)
    print('Shape of label tensor:', labels.shape)

    y = labels

    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos_ = y_pred == 1
    y_neg_ = y_pred == 0
    add_idx = np.any([y_pos_, y_neg_], axis=0)
    add_y = y_pred[add_idx]


    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)


    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    result = np.zeros((len(te['q1']), 1))

    oof_y = np.zeros((len(y), 1))
    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        tr_x = get_subset(tr,tr_idx)


        if semi_sv:
            te_x = get_subset(te, y_idx)
            tr_data = concat_data(tr_x,te_x)
            tr_y = np.concatenate([y[tr_idx],y_pred])
        else:
            add_data = get_subset(te,add_idx)
            tr_data = concat_data(tr_x,add_data)
            tr_y = np.concatenate([y[tr_idx], add_y])
            # tr_data = tr_x
            # tr_y = y[tr_idx]

        val_x = get_subset(tr, val_idx)
        val_y = y[val_idx]

        use_word = True
        if use_data!='words':
            use_word = False
        model = get_model(word_embedding_matrix=embedding_matrix,use_word=use_word)
        if n_fold == 0:
            print(model.summary())

        hist = epochHistory()
        print(n_fold)
        model.fit(tr_data,
                  tr_y,
                  epochs=epoch,
                  validation_data=[val_x,val_y],
                  verbose=1,
                  batch_size=256,
                  callbacks=[
                      EarlyStopping(patience=2, monitor='val_binary_crossentropy'),
                      # LearningRateScheduler(lr_de,verbose=1)
                      hist,
                      ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',monitor='val_binary_crossentropy',save_weights_only=True)
                  ])
        result += iter_ense(hist.epochs,model,te)
        # result += model.predict(te, batch_size=1024)

        oof_y[val_idx] = model.predict(val_x, batch_size=2048)

        K.clear_session()
        tf.reset_default_graph()

    # write out the submission results
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)


    ## save the predicted training labels
    # oof_y = oof_y[:,0]
    # oof_y_ = oof_y.round().astype(int)
    #
    # error_idx = oof_y_!=y
    # print(np.sum(error_idx))
    # oof_y[error_idx] = 1-oof_y[error_idx]
    submit = pd.DataFrame()
    submit['y_pre'] = oof_y[:,0]
    submit.to_csv('./data/oofy.csv',index=False)
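
# A minimal sketch of the pseudo-labelling selection used in train() above:
# only test rows predicted confidently (above 0.75 or below 0.25) are kept as
# extra training data. The probabilities here are toy values:
import numpy as np

y_pred = np.array([0.95, 0.50, 0.10, 0.80, 0.30])
y_pos = y_pred > 0.75
y_neg = y_pred < 0.25
y_idx = np.any([y_pos, y_neg], axis=0)  # elementwise OR of the two masks
print(y_idx)                            # [ True False  True  True False]
print(y_pred[y_idx])                    # pseudo-labels: [0.95 0.1  0.8 ]
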
Example #21
#! /usr/bin/python

from numpy             import *
from pylab             import *
from matplotlib.pyplot import *
from matplotlib        import rc
from cmath             import exp, cos, sin
from curvfit           import *
from readdata          import read_data
from RK_solver         import RK4_solver, Adaptive_RK4_solver

#Constants
(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data('input.xml')
#k   = 1.0
Scale = 1
mm    = int((rmax-r0)*Scale/5)
Delta = d/2 + sqrt(d**2/4 + ms)
pow_r = [-Delta,Delta-d]

[r,fpr,fnr] = RK4_solver(rs,r0,L,kx,kt,ms,d,h3,h2,h1,rmax,Q,M,v,muq)

fp_real = zeros(len(fpr))
for i in range(0, len(fpr)):
   fp_real[i] = fpr[i].real

fp_imag = zeros(len(fpr))
for i in range(0, len(fpr)):
   fp_imag[i] = fpr[i].imag

for i in range(0, len(r)):
   r[i] = r[i]-r0
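
# A vectorized equivalent of the element-wise loops above, assuming fpr is a
# NumPy array of complex samples (toy values shown):
import numpy as np

fpr_demo = np.array([1 + 2j, 3 - 4j])
fp_real_demo = fpr_demo.real         # replaces the first loop
fp_imag_demo = fpr_demo.imag         # replaces the second loop
r_demo = np.array([1.0, 2.0]) - 0.5  # shift the radial grid by r0
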
Example #22
#! /usr/bin/python

from numpy import *
from pylab import *
from matplotlib.pyplot import *
from matplotlib import rc
from cmath import exp, cos, sin
from curvfit import *
from readdata import read_data
from RK_solver import RK4_solver, Adaptive_RK4_solver

#Constants
(rs, r0, R2, kx, kt, Rm2, d, h3, h2, h1, rmax, Q, M, v,
 muq) = read_data('input.xml')
#k   = 1.0
Scale = 1
mm = int((rmax - r0) * Scale / 5)
Delta = d / 2 + sqrt(d**2 / 4 + Rm2)
pow_r = [-Delta, Delta - d]

err = 1e-9
[r, fr, tot_err] = Adaptive_RK4_solver(rs, r0, R2, kx, kt, Rm2, d, rmax, Q, M,
                                       v, muq, err)
print "Total Error is ", tot_err

f_real = zeros(len(fr))
for i in range(0, len(fr)):
    f_real[i] = fr[i].real

f_imag = zeros(len(fr))
for i in range(0, len(fr)):
    f_imag[i] = fr[i].imag

def main():
    # Set up the database of objects
    X = readdata.read_data(infl)
    # Choose initial means with K-means
    means = ChooseInitialMeans(X)
    # Set up initial clusters
    distmat = SetDistMat(X, means)
    clusters = InitialAssignment(distmat)
    ## debug code
    #keys = sorted(clusters.keys())
    #for key in keys:
    #    print("cluster %i:"%key)
    #    print(clusters[key])
    ## end of debug
    # Iteration step
    for iter in range(max_iter):
        active = 0  # indicate the number of transfers in the current iteration
        tranlst = (-1) * np.ones(
            k, dtype='int')  # set up transfer list for each cluster
        # Compute the cluster means
        oldmeans = means.copy()
        means = CalcMeans(X, oldmeans, clusters)
        # Get statistics about the clustering
        #ClusterStat(X, means, clusters)
        ## debug code
        #print("old means:")
        #print(oldmeans)
        #print("new means:")
        #print(means)
        ## end of debug
        # For each object, compute the distances to the cluster means
        distmat = SetDistMat(X, means)
        # Sort objects based on the delta of the current assignment and the best
        # possible alternate assignment
        objlst = SortObj(X, clusters, means, distmat)
        ##debug code
        #print(objlst)
        ##return
        #end of debug
        # For each element by priority:
        while (len(objlst)):
            (i, key, temp) = objlst.pop()
            obj2key = GetDist(X[i], means[key])
            transferred = False  # record whether any transferring has occurred for i
            if (key == distmat[i, 0][0]):
                ##debug
                #print("%i is already the opt cluster for obj %i. no transfer"%(clu, i))
                ##end of debug
                continue
            # For each other clusters by element gain:
            else:
                for j in range(k):
                    clu = distmat[i, j][0]  # the key of another cluster
                    # gain by transferring i from cluster key to clu
                    objgain = obj2key - distmat[i, j][1]
                    if (clu == key):  # already in the cluster
                        continue
                    if (len(clusters[clu]) < cluster_size):
                        active += 1
                        transferred = True
                        clusters = Transfer(i, key, clu, clusters)
                        ##debug
                        #print("cluster %i not full. transfer obj %i from cluster %i to it."%(clu, i, key))
                        ##end of debug
                        break
                    elif tranlst[clu] != -1:  # the tranlst of another cluster is not empty
                        # distance between the obj in the tranlst and the current cluster
                        tran2key = GetDist(X[tranlst[clu]], means[key])
                        tran2clu = GetDist(X[tranlst[clu]], means[clu])
                        # gain by transferring the obj in tranlst from cluster clu to key
                        trangain = tran2clu - tran2key
                        # transfer if the sum of gains is positive, i.e. a net gain
                        if objgain + trangain > 0:
                            active += 2
                            transferred = True
                            clusters = Transfer(i, key, clu, clusters)
                            clusters = Transfer(tranlst[clu], clu, key,
                                                clusters)
                            ##debug
                            #print("obj %i is transfered from cluster %i to %i"%(i, key, clu))
                            #print("obj %i is transfered from cluster %i to %i"%(tranlst[clu], clu, key))
                            #print("objgain: %f, trangain: %f"%(objgain, trangain))
                            ##end of debug
                            tranlst[clu] = -1  # reset the tranlst to empty
                            break
                if (not transferred):
                    tranlst[key] = i
                    ##debug
                    #print("add obj %i in cluster %i to the transfer list"%(i, key))
                    ##end of debug
        # if nothing was transferred during this iteration, the clustering has converged
        if (not active):
            break
        #debug code
        print("number of transfers in iter %i: %i\n" % (iter + 1, active))
        #end of debug
    print("K-means clustering converged in %d iterations!\n" % (iter + 1))
    # Output the clustering results
    WriteResult(outfl, X, means, clusters)
    ClusterStat(X, means, clusters)
    return (0)
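
# A toy illustration of the two-way transfer criterion used in main() above:
# a swap is accepted when the combined distance gain is positive. All the
# distances below are made up for illustration.
obj2key = 4.0    # object i to its current cluster mean
obj2clu = 1.5    # object i to the candidate cluster mean
tran2key = 2.0   # queued object to cluster key's mean
tran2clu = 3.0   # queued object to its own cluster's mean

objgain = obj2key - obj2clu      # gain from moving i to clu
trangain = tran2clu - tran2key   # gain from moving the queued object to key
if objgain + trangain > 0:       # net gain, so both transfers go ahead
    print("swap accepted, net gain = %.1f" % (objgain + trangain))
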
# -*- coding: utf-8 -*-
"""
Created by Maya May 9, 2021
"""
import pandas as pd
from readdata import read_data
alldata = read_data('flightdelays-2010-2020.csv')


Example #26
#! /usr/bin/python

from numpy             import *
from pylab             import *
from matplotlib.pyplot import *
from matplotlib        import rc
from cmath             import exp, cos, sin
from curvfit           import *
from readdata          import read_data
from RK_solver         import RK4_solver, Adaptive_RK4_solver

#Constants
(rs,r0,R2,kx,kt,Rm2,d,h3,h2,h1,rmax,Q,M,v,muq) = read_data('input.xml')
#k   = 1.0
Scale = 1
mm    = int((rmax-r0)*Scale/5)
Delta = d/2 + sqrt(d**2/4 + Rm2)
pow_r = [-Delta,Delta-d]

err = 1e-9
[r,fr,tot_err]= Adaptive_RK4_solver(rs,r0,R2,kx,kt,Rm2,d,rmax,Q,M,v,muq,err)
print "Total Error is " , tot_err

f_real = zeros(len(fr))
for i in range(0, len(fr)):
   f_real[i] = fr[i].real

f_imag = zeros(len(fr))
for i in range(0, len(fr)):
   f_imag[i] = fr[i].imag
Example #27
    pre_train_epoch = 5000
    epoch_num = 500  #50
    depth = len(out_channels)
    assert len(in_channels) == len(middle_channels) and len(
        middle_channels) == len(out_channels)

    k = 3
    d0 = 1
    ############# pre-process data part
    label_CSV = "data/ad_data/APOE.csv"
    label_name = "APOE"
    class_num = 2  ########
    group_test_times = 5000
    ################

    _, _, Label = read_data(label_CSV, label_name, recalculate=False)

    NagPos = np.where(Label)
    PosPos = np.where(1 - Label)

    Length, Nampyte = load_length("./data/ad_data/processed_data/" +
                                  label_name + "/Track_info.txt")

    Trackids = [int(sys.argv[1])]  ###### all fibers
    Track_num = len(Trackids)

    matrix_length_all = np.zeros(Track_num, dtype=np.int32)
    for i in range(Track_num):
        matrix_length_all[i] = Length[Trackids[i]][1]

    matrix_length = np.sum(matrix_length_all)
Example #28
from matplotlib.pyplot import *
from numpy import *
from Green_function import Green_func
from readdata import read_data

(rs, r0, L, kx, kt, ms, d, h3, h2, h1, rmax, Q, M, muq) = read_data()

#kx = [0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#kx = [0.0,0.5,1.5,2.5,5.0,10.0]
kx = 3.0
kt = linspace(-2.0, 2.0, 81)
Scale = 1
Gr_f = zeros(len(kt), complex)
for i in range(0, len(kt)):
    Gr_f[i] = Green_func(rs, r0, L, kx, kt[i], ms, d, h3, h2, h1, rmax, Q, M,
                         muq, Scale)
    print(Gr_f[i])
print(Scale)
figure(1)
plot(kt, Gr_f.imag, label='Scale = %f' % Scale)
#######################################################################################
#Scale = 10
#Gr_f = zeros(len(kt),complex)
#for i in range(0,len(kt)):
#   Gr_f[i] = Green_func(rs,r0,L,kx,kt[i],ms,d,h3,h2,h1,rmax,Q,M,v,muq,Scale)
#   print Gr_f[i]
#print Scale
#figure(1)
#plot(kt,Gr_f.imag,label='Scale = %f'%Scale)
Example #29
def train_wc(semi_sv,output,epoch=1000):
    from readdata import read_data

    tr_q1, tr_q2, te_q1, te_q2, word_embedding_matrix, labels = read_data('words')
    trc_q1, trc_q2, tec_q1, tec_q2, char_embedding_matrix, labels = read_data('chars')

    X = {
        'q1': tr_q1,
        'q2': tr_q2,
        'qc1': trc_q1,
        'qc2': trc_q2
    }
    y = labels


    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds
    from nn import aggmodel

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)

    # oof_y = np.zeros((len(X['q1']),1))
    oof_y = pd.read_csv("./data/oofy.csv")['y_pre'].values
    alpha = 1
    oof_y = (1 - alpha) * y + alpha * oof_y


    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True,)
    result = np.zeros((len(te_q1), 1))

    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        if semi_sv:
            Q1_tr = np.concatenate([X['q1'][tr_idx], te_q1[y_idx]])
            Q2_tr = np.concatenate([X['q2'][tr_idx], te_q2[y_idx]])
            Qc1_tr = np.concatenate([X['qc1'][tr_idx],tec_q1[y_idx]])
            Qc2_tr = np.concatenate([X['qc2'][tr_idx],tec_q2[y_idx]])
            y_tr = np.concatenate([y[tr_idx], y_pred])
            # y_tr = np.concatenate([oof_y[tr_idx], y_pred])

            idx = list(range(len(y_tr)))
            np.random.shuffle(idx)
            Q1_tr = Q1_tr[idx]
            Q2_tr = Q2_tr[idx]
            Qc1_tr = Qc1_tr[idx]
            Qc2_tr = Qc2_tr[idx]
            y_tr = y_tr[idx]
        else:
            Q1_tr = X['q1'][tr_idx]
            Q2_tr = X['q2'][tr_idx]
            Qc1_tr = X['qc1'][tr_idx]
            Qc2_tr = X['qc2'][tr_idx]
            y_tr = y[tr_idx]
            # y_tr = oof_y[tr_idx]



        Q1_te = X['q1'][val_idx]
        Q2_te = X['q2'][val_idx]
        Qc1_te = X['qc1'][val_idx]
        Qc2_te = X['qc2'][val_idx]
        y_te = y[val_idx]

        model = aggmodel(word_embedding_matrix,char_embedding_matrix)
        if n_fold == 0:
            print(model.summary())
        print(n_fold)
        model.fit([Q1_tr, Q2_tr,Qc1_tr,Qc2_tr],
                  y_tr,
                  epochs=epoch,
                  validation_data=[[Q1_te, Q2_te,Qc1_te, Qc2_te], y_te],
                  verbose=1,
                  batch_size=256,
                  callbacks=[
                      EarlyStopping(patience=3, monitor='val_binary_crossentropy'),
                      # LearningRateScheduler(lr_de,verbose=1)
                  ], )
        # model.load_weights(model_path)

        result += model.predict([te_q1,te_q2,tec_q1,tec_q2], batch_size=1024)


        # free GPU memory
        K.clear_session()
        tf.reset_default_graph()

    # write out the submission results
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)
Example #30
    K.clear_session()
    tf.reset_default_graph()

    submit = 0
    total_w = 0
    for y_pred,ense_w in results:
        submit += ense_w*y_pred
        total_w += ense_w

    return submit/total_w
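
# Toy check of the weighted ensemble averaging above: two sets of fold
# predictions with ensemble weights 2 and 1 (made-up numbers).
import numpy as np

results_demo = [(np.array([[0.9], [0.2]]), 2.0),
                (np.array([[0.6], [0.4]]), 1.0)]
submit_demo, total_w_demo = 0, 0
for y_pred, ense_w in results_demo:
    submit_demo += ense_w * y_pred
    total_w_demo += ense_w
print(submit_demo / total_w_demo)  # [[0.8], [0.26666667]]
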



if __name__ == '__main__':
    from readdata import read_data

    _, te_word, embedding_matrix_word,__ = read_data('words', data_aug=False)
    _, te_char, embedding_matrix_char,__ = read_data('chars', data_aug=False)

    submit_atten = ensemble('esim',te_word,te_char,embedding_matrix_word,embedding_matrix_char)

    submit = pd.DataFrame()
    submit['y_pre'] = list(submit_atten[:, 0])
    submit.to_csv('atten.csv', index=False)





Example #31
def train(use_data, semi_sv, output, data_aug, use_model):
    def get_subset(dataset, idx):
        data = {}
        for key, value in dataset.items():
            data[key] = value[idx]
        return data

    def concat_data(data1, data2):
        result = {}
        for k in data1.keys():
            result[k] = np.concatenate([data1[k], data2[k]])
        return result

    def get_aug_data(tr_x, tr_y):
        tr_q1 = tr_x['q1']
        tr_q2 = tr_x['q2']
        tr_gf = tr_x['gf']
        tr_q1node = tr_x['q1node']
        tr_q2node = tr_x['q2node']

        res_q1 = []
        res_q2 = []
        res_gf = []
        res_q1node = []
        res_q2node = []
        res_y = []

        for q1, q2, gf, q1node, q2node, y in zip(tr_q1, tr_q2, tr_gf,
                                                 tr_q1node, tr_q2node, tr_y):
            r1 = q1[np.in1d(q1, q2, invert=True)]
            len1 = len(r1)
            if len1 < 4 or len1 == len(q1[q1 != 0]):
                continue

            r2 = q2[np.in1d(q2, q1, invert=True)]
            len2 = len(r2)
            if len2 < 4 or len2 == len(q2[q2 != 0]):
                continue

            out1 = np.zeros(15, dtype=np.int32)
            out2 = np.zeros(15, dtype=np.int32)
            out1[-len1:] = r1
            out2[-len2:] = r2

            res_q1.append(out1)
            res_q2.append(out2)
            res_gf.append(gf)
            res_q1node.append(q1node)
            res_q2node.append(q2node)
            res_y.append(y)

        res_x = {
            'q1': np.asarray(res_q1),
            'q2': np.asarray(res_q2),
            'gf': np.asarray(res_gf),
            'q1node': np.asarray(res_q1node),
            'q2node': np.asarray(res_q2node)
        }
        res_y = np.asarray(res_y)
        return res_x, res_y

    from nn import rnnword, aggmodel, esim, attention, rnn_res
    if use_model == 'rnnword':
        get_model = rnnword
    elif use_model == 'aggmodel':
        pass
    elif use_model == 'esim':
        get_model = esim
    elif use_model == 'attention':
        get_model = attention
    elif use_model == 'res':
        get_model = rnn_res
    else:
        raise RuntimeError("don't have this model")

    from readdata import read_data

    model_name = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H:%M:%S') + '_' + use_data + '_' + str(semi_sv) + '_' + str(
            data_aug) + '_'

    tr, te, embedding_matrix, labels = read_data(use_data, data_aug=data_aug)

    print(use_data)
    print('Shape of label tensor:', labels.shape)

    y = labels

    from config import model_path
    from sklearn.cross_validation import StratifiedKFold, KFold
    from config import n_folds

    y_pred = pd.read_csv("./data/y_pred.csv")['y_pre'].values
    y_pos_ = y_pred == 1
    y_neg_ = y_pred == 0
    add_idx = np.any([y_pos_, y_neg_], axis=0)
    add_y = y_pred[add_idx]

    y_pos = y_pred > 0.75
    y_neg = y_pred < 0.25
    y_idx = np.any([y_pos, y_neg], axis=0)
    y_pred = y_pred[y_idx]
    print(y_idx.shape)

    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    result = np.zeros((len(te['q1']), 1))

    oof_y = np.zeros((len(y), 1))
    for n_fold, (tr_idx, val_idx) in enumerate(folds):
        tr_x = get_subset(tr, tr_idx)
        tr_y = y[tr_idx]
        # if data_aug:
        #     res_x,res_y = get_aug_data(tr_x,tr_y)
        #     tr_x = concat_data(tr_x,res_x)
        #     tr_y = np.concatenate([tr_y,res_y])

        if semi_sv:
            te_x = get_subset(te, y_idx)
            tr_data = concat_data(tr_x, te_x)
            tr_y = np.concatenate([tr_y, y_pred])
            patience = 3
        else:
            add_data = get_subset(te, add_idx)
            tr_data = concat_data(tr_x, add_data)
            tr_y = np.concatenate([tr_y, add_y])
            patience = 2
            # tr_data = tr_x
            # tr_y = y[tr_idx]

        val_x = get_subset(tr, val_idx)
        val_y = y[val_idx]

        use_word = True
        if use_data != 'words':
            use_word = False
        model = get_model(word_embedding_matrix=embedding_matrix,
                          use_word=use_word)
        if n_fold == 0:
            print(model.summary())

        # hist = epochHistory()
        print(n_fold)
        model.fit(
            tr_data,
            tr_y,
            epochs=1000,
            validation_data=[val_x, val_y],
            verbose=1,
            batch_size=256,
            callbacks=[
                EarlyStopping(patience=patience,
                              monitor='val_binary_crossentropy'),
                # LearningRateScheduler(lr_de,verbose=1)
                # hist,
                # ModelCheckpoint('./weight/weights.{epoch:d}.hdf5',monitor='val_binary_crossentropy',save_weights_only=True)
            ])
        # result += iter_ense(hist.epochs,model,te)
        result += model.predict(te, batch_size=1024)

        model.save_weights('./weight/' + model_name + str(n_fold) + '.h5')
        # oof_y[val_idx] = model.predict(val_x, batch_size=2048)

        K.clear_session()
        tf.reset_default_graph()

    # write out the submission results
    result /= n_folds
    submit = pd.DataFrame()
    submit['y_pre'] = list(result[:, 0])
    submit.to_csv(output, index=False)

    ## save the predicted training labels
    # oof_y = oof_y[:,0]
    # oof_y_ = oof_y.round().astype(int)
    #
    # error_idx = oof_y_!=y
    # print(np.sum(error_idx))
    # oof_y[error_idx] = 1-oof_y[error_idx]
    submit = pd.DataFrame()
    submit['y_pre'] = oof_y[:, 0]
    submit.to_csv('./data/oofy.csv', index=False)
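
# A toy run of the token-masking step inside get_aug_data above, with
# hypothetical padded token ids (0 = padding):
import numpy as np

q1_demo = np.array([0, 0, 0, 11, 12, 13, 14, 15])
q2_demo = np.array([0, 0, 0, 11, 21, 22, 23, 24])
r1_demo = q1_demo[np.in1d(q1_demo, q2_demo, invert=True)]  # tokens only in q1
r2_demo = q2_demo[np.in1d(q2_demo, q1_demo, invert=True)]  # tokens only in q2
print(r1_demo)  # [12 13 14 15]
print(r2_demo)  # [21 22 23 24]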