Code Example #1
File: main.py Project: yinliu13/MarketBasket
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    # keep only a 20% random sample of the user baskets; the other 80% is discarded
    ok, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    print(ub_basket)

    all_baskets = ub_basket.basket.values
    print(all_baskets)
    # change every item in every basket to a string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    print("uncommon products")
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    print("short baskets")
    medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    print(medium_baskets , all_baskets)
    print("nested change")
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)
    print('knndtw')
    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, 
                                          embedding_wrapper.basket_dist_REMD)
    print(preds_all)
    print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    return preds_all, distances
Code Example #2
File: reduce_kNN.py Project: anishpurohit/dropbox
def fit_clean(data):
    '''
        Fit the model without any principal components
    '''
    # split the data into training and testing
    train_x, train_y, \
    test_x,  test_y, \
    labels = hlp.split_data(
        data,
        y = 'credit_application'
    )

    # fit the model and print the summary
    class_fit_predict_print((train_x, train_y, test_x, test_y))
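Several snippets on this page unpack five values from hlp.split_data(data, y=...). The helper itself is not reproduced here; the sketch below is only an assumption of what a compatible helper could look like, built on scikit-learn's train_test_split, and the real hlp.split_data may differ.

# Hypothetical five-value split_data sketch compatible with the calls on this
# page; names and defaults are assumptions, not the original code.
from sklearn.model_selection import train_test_split

def split_data(data, y, x=None, test_size=0.33, random_state=666):
    # default to every column except the target as features
    if x is None:
        x = [col for col in data.columns if col != y]

    train_x, test_x, train_y, test_y = train_test_split(
        data[x], data[y], test_size=test_size, random_state=random_state)

    # return the splits plus the list of feature labels
    return train_x, train_y, test_x, test_y, x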
Code Example #3
    def __init__(self):

        with open("helper/dictionary.pickle", "rb") as f:
            dictionary = pickle.load(f)

        vocab_size = len(dictionary)

        # Load data -> change to mongoDb Connector later
        data = pd.read_csv("data_csv/data")
        y = data.point

        with open("features/tf_idf_vectorizer.pickle", "rb") as f:
            vectorizer = pickle.load(f)

        corpus = data[['project', 'concat']]

        analyzer = vectorizer.build_analyzer()

        corpus['concat'] = corpus.apply(lambda x: analyzer(x[1]), axis=1)

        tmp = corpus.apply(lambda x: len(x[1]), axis=1)

        sentence_size = max(tmp)

        corpus['concat'] = corpus.apply(
            lambda x: [dictionary.get(i) for i in x[1]], axis=1)

        x_train, x_test, y_train, y_test = split_data(corpus, y, ratio=0.2)

        pad_id = 0

        x_train = sequence.pad_sequences(x_train['concat'].values,
                                         maxlen=sentence_size,
                                         truncating='post',
                                         padding='post',
                                         value=pad_id)

        x_test = sequence.pad_sequences(x_test['concat'].values,
                                        maxlen=sentence_size,
                                        truncating='post',
                                        padding='post',
                                        value=pad_id)

        self.vocab_size = vocab_size
        self.sentence_size = sentence_size
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
Code Example #4
def lightGBMmodel(size, x):
    data_csv = pd.read_csv("data_csv/data", low_memory=False)

    #x = sparse.load_npz("features/tf_idf_matrix.npz")

    #    x = pd.read_csv("features/word2vec_ave_"+str(size)+".csv",index_col=0,low_memory=False)

    #x = pd.read_csv("features/doc2vec.csv",index_col=0)

    y = data_csv.point

    x_train, x_test, y_train, y_test = split_data(x, y, ratio=0.2)

    lgbm = LGBMRegressor(random_state=99)
    lgbm.fit(x_train.iloc[:, :-1], y_train)

    y_pred = lgbm.predict(x_test.iloc[:, :-1])

    result_total = dict()
    result_total['MSE'] = mean_squared_error(y_pred, y_test)
    result_total['MAE'] = mean_absolute_error(y_pred, y_test)
    result_total['MdAE'] = median_absolute_error(y_pred, y_test)

    print("Mean Absolute Error: ", mean_absolute_error(y_pred, y_test))
    print("Median Absolute Error: ", median_absolute_error(y_pred, y_test))
    print("Mean Squared Error: ", mean_squared_error(y_pred, y_test))

    print("Evaluation on each project")
    tmp = pd.DataFrame()
    tmp['project'] = x_test['project']
    tmp['pred'] = y_pred
    tmp['truth'] = y_test

    result_each = tmp.groupby(by='project').apply(
        lambda col: mean_squared_error(col.pred, col.truth)).to_frame(
            name='MSE')
    result_each['MAE'] = tmp.groupby(by='project').apply(
        lambda col: mean_absolute_error(col.pred, col.truth))
    result_each['MdAE'] = tmp.groupby(by='project').apply(
        lambda col: median_absolute_error(col.pred, col.truth))

    return (result_each, result_total)
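Examples #3 and #4 unpack four values from a different split_data(x, y, ratio=0.2) signature, which is also not shown here. A minimal sketch, assuming ratio is the held-out test fraction and the helper simply wraps scikit-learn's train_test_split:

# Hypothetical four-value split_data matching Examples #3 and #4; the
# project's real helper may be implemented differently.
from sklearn.model_selection import train_test_split

def split_data(x, y, ratio=0.2, random_state=0):
    # 'ratio' is assumed to be the test-set fraction
    return train_test_split(x, y, test_size=ratio, random_state=random_state)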
Code Example #5
File: main.py Project: MathiasKraus/MarketBasket
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)

    all_baskets = ub_basket.basket.values
    all_baskets = nested_change(list(all_baskets), str)

    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    all_baskets = remove_short_baskets(all_baskets)
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)

    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(
        all_baskets)

    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input,
                                          embedding_wrapper.basket_dist_EMD,
                                          embedding_wrapper.basket_dist_REMD)
    return preds_all, distances
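A minimal way to invoke this pipeline as a script, assuming the module layout above; the averaged distance mirrors the commented-out Wasserstein-distance print in Example #1.

# Hypothetical entry point; the original repository may invoke run() differently.
if __name__ == '__main__':
    preds_all, distances = run()
    print("Wasserstein distance", sum(distances) / len(distances))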
Code Example #6
File: reduce_kNN.py Project: anishpurohit/dropbox
def prepare_data(data, principal_components, n_components):
    '''
        Prepare the data for the classification
    '''
    # prepare the column names
    cols = ['pc' + str(i) for i in range(0, n_components)]

    # concatenate the data
    data = pd.concat(
        [data, pd.DataFrame(principal_components, columns=cols)],
        axis=1,
        join_axes=[data.index])

    # split the data into training and testing
    train_x, train_y, \
    test_x,  test_y, \
    labels = hlp.split_data(
        data,
        y = 'credit_application',
        x = cols
    )

    return (train_x, train_y, test_x, test_y)
Code Example #7
import pandas as pd
import sklearn.svm as sv
import helper as hp


@hp.timeit
def fitsvm(data):
    svm = sv.SVC(kernel='rbf', C=20.0, gamma=0.1)
    return svm.fit(data[0], data[1])


csv_data = pd.read_csv(
    "C:/Users/DELL/Desktop/niit/3rd semester/bank_contacts.csv")
train_x, train_y, test_x, test_y, labels = hp.split_data(
    csv_data, y='credit_application')
classifier = fitsvm((train_x, train_y))
predicted = classifier.predict(test_x)
hp.printModelSummary(test_y, predicted)
print(classifier.support_vectors_)

import pandas as pd
import sklearn.svm as sv
import helper as hp
"""
@ is the decorator to call a function from refered module
"""


@hp.timeit
def fitSVM(data):
    #creating the classifier object
Code Example #8
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
        
        for task_id in train_data.keys():
            print "task_id:", task_id

        # split data for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
        
        # train on first part of dataset (evaluate on other)
        prepared_data_weak = PreparedMultitaskData(train_weak, shuffle=False)
        classifiers = self._inner_train(prepared_data_weak, param)

        # train on entire dataset
        prepared_data_final = PreparedMultitaskData(train_data, shuffle=False)
        final_classifiers = self._inner_train(prepared_data_final, param)


        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
        print "done training weak learners"

        #####################################################
        #    perform boosting and wrap things up    
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for task_name in train_boosting.keys():
            
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(classifiers)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                    
                svm = classifiers[i]
                        
                tmp_out = self._predict_weak(svm, examples, prepared_data_weak.name_to_id(task_name), param)

                if param.flags["signum"]:
                    out[:,i] = numpy.sign(tmp_out)
                else:
                    out[:,i] = tmp_out
            
            
            if param.flags["boosting"] == "ones":
                weights = numpy.ones(F)/float(F)
            if param.flags["boosting"] == "L1":
                weights = solve_boosting(out, labels, param.transform, solver="glpk")
            if param.flags["boosting"] == "L2":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
            if param.flags["boosting"] == "L2_reg":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)
            
            
            predictors[task_name] = (final_classifiers, weights, prepared_data_final.name_to_id(task_name), param)
            
            
            assert prepared_data_final.name_to_id(task_name)==prepared_data_weak.name_to_id(task_name), "name mappings don't match"
            
        
        #####################################################
        #    Some sanity checks
        ##################################################### 
        
        # make sure we have the same keys (potentially in a different order)
        sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys)==0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)  


        return predictors
Code Example #9
                                  validationProportion=0.25)

    # and return the regressor
    return ann


# the file name of the dataset
r_filename = '../../Data/Chapter06/power_plant_dataset_pc.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(csv_read,
    y='net_generation_MWh', x=['total_fuel_cons_mmbtu'])

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y), prob='regression')
testing = hlp.prepareANNDataset((test_x, test_y), prob='regression')

# train the model
regressor = fitANN(training)

# predict the output from the unseen data
predicted = regressor.activateOnDataset(testing)

# and calculate the R^2
score = hlp.get_score(test_y, predicted[:, 0])
print('R2: ', score)
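hlp.get_score reports R^2 above but is not shown on this page; a minimal sketch, assuming it is a thin wrapper around scikit-learn's r2_score:

# Hypothetical get_score helper; the book's actual implementation may differ.
from sklearn.metrics import r2_score

def get_score(y_true, y_pred):
    # coefficient of determination (R^2) of the predictions
    return r2_score(y_true, y_pred)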
Code Example #10
    # fit the data
    return logistic_classifier.fit()


# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(
    csv_read,
    y = 'credit_application'
)

# train the model
classifier = fitLogisticRegression((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# assign the class
predicted = [1 if elem > 0.5 else 0 for elem in predicted]

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the parameters
Code Example #11
@hlp.timeit
def fitNaiveBayes(data):
    '''
        Build the Naive Bayes classifier
    '''
    # create the classifier object
    naiveBayes_classifier = nb.GaussianNB()

    # fit the model
    return naiveBayes_classifier.fit(data[0], data[1])

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(
    csv_read, y = 'credit_application')

# train the model
classifier = fitNaiveBayes((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)
Code Example #12
# the file name of the dataset
r_filename = '../../Data/Chapter05/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split into independent and dependent features
x = csv_read[csv_read.columns[:-1]]
y = csv_read[csv_read.columns[-1]]

# split the original data into training and testing
train_x_orig, train_y_orig, \
test_x_orig,  test_y_orig, \
labels_orig = hlp.split_data(
    csv_read, 
    y = 'credit_application'
)

# reduce the dimensionality
csv_read['reduced'] = reduce_LDA(x, y).transform(x)

# split the reduced data into training and testing
train_x_r, train_y_r, \
test_x_r,  test_y_r, \
labels_r = hlp.split_data(
    csv_read, 
    y = 'credit_application',
    x = ['reduced']
)

# train the models
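reduce_LDA is called above but not defined on this page. A minimal sketch, assuming it fits scikit-learn's LinearDiscriminantAnalysis; with a binary target only one discriminant axis exists, which matches the single 'reduced' column.

# Hypothetical reduce_LDA helper consistent with its use above; not the
# original implementation.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def reduce_LDA(x, y, n_components=1):
    # fit an LDA model whose transform() projects x onto n_components axes
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    return lda.fit(x, y)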
Code Example #13
    # and return the classifier
    return ann


# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(
    csv_read,
    y = 'credit_application',
    x = ['n_duration','n_euribor3m','n_age','n_emp_var_rate',
        'n_pdays','month_mar','prev_ctc_outcome_success',
        'n_cons_price_idx','month_apr','n_cons_conf_idx']
)

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y))
testing = hlp.prepareANNDataset((test_x, test_y))

# train the model
classifier = fitANN(training)

# classify the unseen data
predicted = classifier.activateOnDataset(testing)

# the lowest output activation gives the class
predicted = predicted.argmin(axis=1)
Code Example #14

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(
    csv_read,
    y = 'credit_application',
    x = ['n_duration','n_nr_employed',
        'prev_ctc_outcome_success','n_euribor3m',
        'n_cons_conf_idx','n_age','month_oct',
        'n_cons_price_idx','edu_university_degree','n_pdays',
        'dow_mon','job_student','job_technician',
        'job_housemaid','edu_basic_6y']
)

# train the model
classifier = fitGradientBoosting((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the importance of features
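fitGradientBoosting is called above without its definition; a minimal sketch, assuming a scikit-learn GradientBoostingClassifier with illustrative hyperparameters:

# Hypothetical fitGradientBoosting helper matching the (features, target)
# tuple passed above; hyperparameters are assumptions.
from sklearn.ensemble import GradientBoostingClassifier

def fitGradientBoosting(data):
    # data[0] holds the training features, data[1] the target
    classifier = GradientBoostingClassifier(n_estimators=100, random_state=666)
    return classifier.fit(data[0], data[1])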
Code Example #15
# In[3]:


# Loading dataset
filename = "Clean_Akosombo_data.csv"
akosombo = helper.load_csv_data(filename)


# ### Splitting Data

# In[4]:


# Splitting dataset
target_variable = "generation"
X, y, X_train, X_test, X_val, y_train, y_test, y_val = helper.split_data(akosombo, target_variable, validation_data=True)


# ### Scaling Data

# In[5]:


# Data Scaling
X_train, X_test, X_val = helper.scale(X_train, X_test, X_val, scale_validation=True)


# ### Model Creation

# In[9]:
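helper.split_data returns eight values here with validation_data=True, and six values without it in Example #20 further down. The helper is not shown; a minimal sketch under those assumptions, built on two calls to train_test_split (split sizes and random_state are made up):

# Hypothetical split_data helper matching the unpacking in Examples #15 and #20.
from sklearn.model_selection import train_test_split

def split_data(df, target, validation_data=False, test_size=0.2, val_size=0.1):
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0)

    if not validation_data:
        return X, y, X_train, X_test, y_train, y_test

    # carve a validation set out of the training portion
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=0)
    return X, y, X_train, X_test, X_val, y_train, y_test, y_val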
Code Example #16
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # split data for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
                  

        for task_id in train_data.keys():
            print "task_id:", task_id
            
        
        root = param.taxonomy.data
        
        # train on first part of dataset (evaluate on other)
        (classifiers, classifier_at_node) = self._inner_train(train_weak, param)

        # train on entire dataset
        (final_classifiers, final_classifier_at_node) = self._inner_train(train_data, param)

        ###

        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
        print "done training weak learners"

        #####################################################
        #    perform boosting and wrap things up    
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for task_name in train_boosting.keys():
            
            
            instances = train_boosting[task_name]
            
            # get ids of predecessor nodes            
            node_names = [node.name for node in root.get_node(task_name).get_path_root()]
            node_names.append(task_name)
            
            print "node: %s --> %s" % (task_name, str(node_names))
            
            N = len(instances)
            
            
            if param.flags["use_all_nodes"]:
                # use classifiers only from parent nodes
                F = len(classifiers)
                tmp_classifiers = classifiers
                tmp_final_classifiers = final_classifiers
                
            else:
                # use classifiers from all leaves
                F = len(node_names)
                tmp_classifiers = []
                tmp_final_classifiers = []
            
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                
                if param.flags["use_all_nodes"]:
                    svm = classifiers[i]
                else:
                    svm = classifier_at_node[node_names[i]]
                    tmp_classifiers.append(svm)

                    final_svm = final_classifier_at_node[node_names[i]]
                    tmp_final_classifiers.append(final_svm)
                    
                tmp_out = self._predict_weak(svm, examples, task_name)

                if param.flags["signum"]:
                    out[:,i] = numpy.sign(tmp_out)
                else:
                    out[:,i] = tmp_out
            
            
            if param.flags["boosting"] == "ones":
                weights = numpy.ones(F)/float(F)
            if param.flags["boosting"] == "L1":
                weights = solve_boosting(out, labels, param.transform, solver="glpk")
            if param.flags["boosting"] == "L2":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
            if param.flags["boosting"] == "L2_reg":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)
            
            
            predictors[task_name] = (tmp_final_classifiers, weights)
            
        
        #####################################################
        #    Some sanity checks
        ##################################################### 
        
        # make sure we have the same keys (potentially in a different order)
        sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys)==0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)  


        # save graph plot
        mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
        filename = mypath + "graph_" + str(param.id)
        root.plot(filename)#, plot_cost=True, plot_B=True)


        return predictors
Code Example #17
File: reduce_LDA.py Project: anishpurohit/dropbox
# the file name of the dataset
r_filename = '../../Data/Chapter05/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split into independent and dependent features
x = csv_read[csv_read.columns[:-1]]
y = csv_read[csv_read.columns[-1]]

# split the original data into training and testing
train_x_orig, train_y_orig, \
test_x_orig,  test_y_orig, \
labels_orig = hlp.split_data(
    csv_read,
    y = 'credit_application'
)

# reduce the dimensionality
csv_read['reduced'] = reduce_LDA(x, y).transform(x)

# split the reduced data into training and testing
train_x_r, train_y_r, \
test_x_r,  test_y_r, \
labels_r = hlp.split_data(
    csv_read,
    y = 'credit_application',
    x = ['reduced']
)

# train the models
Code Example #18
File: main.py Project: yinliu13/MarketBasket
def runtopncustomers():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    ok, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    #print(ub_basket)

    all_baskets = ub_basket.basket.values
    #print(all_baskets)
    # change every item in every basket to a string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    #print('test' ,all_baskets)
    #every customer sequence
    for s in range(2):
        print(all_baskets[s])
        #itemperklant = np.array([])
        itemperklant = []
        sizes = []
        top_nc = get_top_nc(all_baskets, 2)
        for i in range(len(all_baskets[s])):   # every basket in all baskets
            for j in range(len(all_baskets[s][i])): # every item in every basket
                #print('basket', all_baskets[s][i][j])
                itemperklant.append( all_baskets[s][i][j])
        print(itemperklant)
        unique_items = np.unique(itemperklant)
        print(unique_items)
        arrayklant = np.zeros((int(len(unique_items)), 2))
        arrayklant[:, 0] = unique_items
        for ding in range(len(unique_items)):
            countproduct = itemperklant.count(unique_items[ding])
            # itemperklant.append(countproduct)
            arrayklant[ding, 1] = countproduct

        print(arrayklant)

        sorted = arrayklant[np.argsort(arrayklant[:, 1])]
        print('sorted', sorted)
        product = np.array([])
        print('average length', top_nc[s])
        for reverse in range(int(top_nc[s])):
            print('test', sorted[-reverse - 1, :])
            # append the most frequent products, counted from the end of the ascending sort
            product = np.append(product, sorted[-reverse - 1, :])


    #print("uncommon products")
    #all_baskets = remove_products_which_are_uncommon(all_baskets)
    #print("short baskets")
    #medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    #print(medium_baskets , all_baskets)
    #print("nested change")
    #all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    #print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)
    #print('knndtw')
    #knndtw = KnnDtw(n_neighbors=[5])
    #preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, embedding_wrapper.basket_dist_REMD)
    #print(preds_all)
    #print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    #return preds_all, distances
    write_path = 'data/testprint'
    with open(write_path + '.txt', 'w') as results:
        results.write('All baskets test ' + str(all_baskets) + '\n')
Code Example #19
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """
        
        
        for task_id in train_data.keys():
            print "task_id:", task_id

        # split data for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
        
        # train on first part of dataset (evaluate on other)
        prepared_data_weak = PreparedMultitaskData(train_weak, shuffle=False)
        classifiers = self._inner_train(prepared_data_weak, param)

        # train on entire dataset
        prepared_data_final = PreparedMultitaskData(train_data, shuffle=False)
        final_classifiers = self._inner_train(prepared_data_final, param)


        print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
        print "done training weak learners"

        #####################################################
        #    perform boosting and wrap things up    
        #####################################################

        # wrap up predictors for later use
        predictors = {}

        for task_name in train_boosting.keys():
                        
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(classifiers)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                    
                svm = classifiers[i]
                        
                tmp_out = self._predict_weak(svm, examples, prepared_data_weak.name_to_id(task_name))

                if param.flags["signum"]:
                    out[:,i] = numpy.sign(tmp_out)
                else:
                    out[:,i] = tmp_out
            
            
            if param.flags["boosting"] == "ones":
                weights = numpy.ones(F)/float(F)
            if param.flags["boosting"] == "L1":
                weights = solve_boosting(out, labels, param.transform, solver="glpk")
            if param.flags["boosting"] == "L2":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
            if param.flags["boosting"] == "L2_reg":            
                weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)
            
            
            predictors[task_name] = (final_classifiers, weights, prepared_data_final.name_to_id(task_name))
            
            
            assert prepared_data_final.name_to_id(task_name)==prepared_data_weak.name_to_id(task_name), "name mappings don't match"
            
        
        #####################################################
        #    Some sanity checks
        ##################################################### 
        
        # make sure we have the same keys (potentially in a different order)
        sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
        assert len(sym_diff_keys)==0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)  


        return predictors
Code Example #20
# In[3]:


# Loading dataset
filename = "Clean_Akosombo_data.csv"
akosombo = helper.load_csv_data(filename)


# ### Splitting the Dataset

# In[4]:


# Splitting dataset
target_variable = "generation"
X, y, X_train, X_test, y_train, y_test = helper.split_data(akosombo, target_variable)


# ### Scaling the Dataset

# In[5]:


# Data Scaling
X_train, X_test = helper.scale(X_train, X_test)


# ### Chosing Baseline Models and Training Models

# In[6]:
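helper.scale is likewise not reproduced on this page; a minimal sketch consistent with both calls (two-way here, three-way with scale_validation=True in Example #15), assuming standardization fitted on the training data only:

# Hypothetical scale helper; the repository's implementation may differ.
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test, X_val=None, scale_validation=False):
    # fit the scaler on the training data and reuse it for the other splits
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    if scale_validation:
        X_val = scaler.transform(X_val)
        return X_train, X_test, X_val
    return X_train, X_test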
Code Example #21
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # split for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
          
        # merge data sets
        data = PreparedMultitaskData(train_weak, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        ##################################################
        # define pockets
        ##################################################
        
        pockets = [0]*9
        
        pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
        pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
        pockets[2] = [11, 20, 21, 22, 29, 31]
        pockets[3] = [8, 30, 31, 32]
        pockets[4] = [10, 11, 30]
        pockets[5] = [10, 11, 12, 13, 20, 29]
        pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
        pockets[7] = [12, 14, 15, 26]
        pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]
        
        pockets = []
        for i in xrange(35):
            pockets.append([i])


        #new_pockets = []
        
        # merge neighboring pockets
        #for i in range(8):
        #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
            
        #pockets = new_pockets
        
        
        ########################################################
        print "creating a kernel:"
        ########################################################
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()

        
        classifiers = []


        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel = shogun_factory.create_kernel(data.examples, param)
            
            print "setting normalizer"
            kernel.set_normalizer(normalizer)

            print "training SVM for pocket", pocket
            svm = self._train_single_svm(param, kernel, lab)

            classifiers.append(svm)
        
        
        print "done obtaining weak learners"
            
        
        # save additional info
        #self.additional_information["svm_objective"] = svm.get_objective()
        #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        #print self.additional_information 
        


        ##################################################
        # combine weak learners for each task
        ##################################################
        
        
        # set constants
        
        some = 0.9
        import cvxmod
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_boosting.keys():
            
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(pockets)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                svm = classifiers[i]
                tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

                out[:,i] = numpy.sign(tmp_out)
                #out[:,i] = tmp_out
            

            #TODO: fix
            helper.save("/tmp/out_sparse", (out,labels))
            pdb.set_trace()
            
            weights = solve_boosting(out, labels, some, solver="mosek")
            
            
            
            svms[task_name] = (data.name_to_id(task_name), svm)

        
        return svms
Code Example #22
    trainer.trainUntilConvergence(maxEpochs=50, verbose=True, 
        continueEpochs=2, validationProportion=0.25)

    # and return the regressor
    return ann

# the file name of the dataset
r_filename = '../../Data/Chapter06/power_plant_dataset_pc.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(csv_read, 
    y='net_generation_MWh', x=['total_fuel_cons_mmbtu'])

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y), 
    prob='regression')
testing  = hlp.prepareANNDataset((test_x, test_y),
    prob='regression')

# train the model
regressor = fitANN(training)

# predict the output from the unseen data
predicted = regressor.activateOnDataset(testing)

# and calculate the R^2
score = hlp.get_score(test_y, predicted[:, 0])
Code Example #23
# Accuracy
correct_pred = tf.equal(tf.argmax(logits, axis=1), tf.argmax(y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

############ DATA ############
# inputs: two integers
# targets: [0, 1] if the sum is higher than 10. [1, 0] if the sum is lower than 10.

inputs, targets = get_data(max_int=10, size=10000)

# preprocessing: normalize inputs to be between -1 and 1.
inputs = (inputs - 5) / 5
# TODO: make a preprocessing helper function, subtracting the mean and dividing by the max value

# split train and test data
train_inputs, test_inputs, train_targets, test_targets = split_data(
    inputs, targets)

############ SESSION ############
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # TRAINING
    for epoch in range(epochs):

        batch_x, batch_y = get_batches(train_inputs, train_targets, batch_size)

        for batch in range(train_inputs.shape[0] // batch_size):

            sess.run(optimizer,
                     feed_dict={
                         x: batch_x[batch],
Code Example #24
conditions = pd.read_csv(PATH_DATA + 'conditions_mergedGenes.tsv',
                         sep='\t',
                         index_col=None)

STRAIN_ORDER = pd.read_table(PATH_DATA + 'strain_order.tsv',
                             header=None).values.ravel()
# *****************
# *** Expression averaged across replicates of strains by timepoint

# *** Average RPKUM data of strains across timepoints
# Combine expression and metadata
merged = merge_genes_conditions(
    genes=genes,
    conditions=conditions[['Time', 'Measurment', 'Strain']],
    matching='Measurment')
splitted = split_data(data=merged, split_by='Strain')
# Average by strain
by_strain = {}
for split, data in splitted.items():
    genes_avg_stage = data.groupby('Time').mean()
    by_strain[split] = genes_avg_stage.T
# Combine data of different strains and add metadata
strains_data = []
for strain, data in by_strain.items():
    strain_data = pd.DataFrame({
        'Strain': [strain] * data.shape[1]
    },
                               index=data.columns).T.append(data)
    strains_data.append(strain_data)
genes_avg = pd.concat(strains_data, axis=1).T
genes_avg['Time'] = genes_avg.index
Code Example #25
    return forest.fit(data[0],data[1])

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x,  test_y, \
labels = hlp.split_data(
    csv_read, 
    y = 'credit_application',
    x = ['n_duration','n_nr_employed',
        'prev_ctc_outcome_success','n_euribor3m',
        'n_cons_conf_idx','n_age','month_oct',
        'n_cons_price_idx','edu_university_degree','n_pdays',
        'dow_mon','job_student','job_technician',
        'job_housemaid','edu_basic_6y']
)

# train the model
classifier = fitRandomForest((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the importance of features
Code Example #26
VOCAB_SIZE_ENCODER = len(metadata['idx2w'])
VOCAB_SIZE_DECODER = VOCAB_SIZE_ENCODER  # since same language
EMBED_DIMS = 200
HIDDEN_UNITS = 200
NUMBER_OF_LAYERS = 3
LEARNING_RATE = 0.001
DROPOUT = 0.5

BATCH_SIZE = 32
TEST_EPOCHS = 1
SAVED_MODEL_DIR = 'saved_model_seq2seq'
# shuffling data
idxQ, idxA = helper.shuffle_data(idxQ, idxA)

# splitting data into train, test, validation
trainX, trainY, testX, testY, valX, valY = helper.split_data(
    idxQ, idxA, TRAIN_DATA_PERCENT, TEST_DATA_PERCENT, VAL_DATA_PERCENT)

# creating model class object
model = seq2seq_model(vocabSizeEncoder=VOCAB_SIZE_ENCODER,
                      vocabSizeDecoder=VOCAB_SIZE_DECODER,
                      maxLenX=MAX_LEN_X,
                      maxLenY=MAX_LEN_Y,
                      embedDims=EMBED_DIMS,
                      numLayers=NUMBER_OF_LAYERS,
                      hiddenUnits=HIDDEN_UNITS,
                      lr=LEARNING_RATE)

# rebuilding the tensorflow graph

model.build_model_graph()
Code Example #27
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from helper import get_data, split_data, visualize

name = 'Support vector'

if __name__ == '__main__':
    x, y = get_data()
    y = y.reshape(len(y), 1)
    x_train, x_test, y_train, y_test = split_data(x, y)

    x_scaler = StandardScaler()
    y_scaler = StandardScaler()

    x_train = x_scaler.fit_transform(x_train)
    x_test = x_scaler.transform(x_test)

    y_train = y_scaler.fit_transform(y_train)
    y_test = y_scaler.transform(y_test)

    regression = SVR(kernel='rbf')
    regression.fit(x_train, y_train)

    y_predicted = regression.predict(x_test)
    y_predicted = y_scaler.inverse_transform(y_predicted)
    y_test = y_scaler.inverse_transform(y_test)

    visualize(y_test, y_predicted, name)
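get_data and visualize come from the same helper module and are not shown here. The sketch below is only an assumption (a CSV whose last column is the target, and a simple scatter plot); depending on the scikit-learn version, y_predicted may also need reshape(-1, 1) before the inverse_transform call above.

# Hypothetical get_data and visualize helpers; file name, column layout and
# plot style are assumptions for illustration only.
import pandas as pd
import matplotlib.pyplot as plt

def get_data(path='data.csv'):
    df = pd.read_csv(path)
    # assume every column but the last is a feature, the last is the target
    return df.iloc[:, :-1].values, df.iloc[:, -1].values

def visualize(y_true, y_pred, name):
    plt.scatter(range(len(y_true)), y_true, label='actual')
    plt.scatter(range(len(y_pred)), y_pred, label='predicted')
    plt.title(name + ' regression')
    plt.legend()
    plt.show()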
Code Example #28
    def _train(self, train_data, param):
        """
        training procedure using training examples and labels
        
        @param train_data: Data relevant to SVM training
        @type train_data: dict<str, list<instances> >
        @param param: Parameters for the training procedure
        @type param: ParameterSvm
        """

        # split for training weak_learners and boosting
        (train_weak, train_boosting) = split_data(train_data, 4)
          
        # merge data sets
        data = PreparedMultitaskData(train_weak, shuffle=True)
        
        # create shogun label
        lab = shogun_factory.create_labels(data.labels)
        


        
        
        ########################################################
        print "creating a kernel:"
        ########################################################
        
        
        # init seq handler 
        pseudoseqs = SequencesHandler()

        
        classifiers = []


        for pocket in pockets:

            print "creating normalizer"
            #import pdb
            #pdb.set_trace()
            
            normalizer = MultitaskKernelNormalizer(data.task_vector_nums)
            
            print "processing pocket", pocket

            # set similarity
            for task_name_lhs in data.get_task_names():
                for task_name_rhs in data.get_task_names():
                    
                    similarity = 0.0
                    
                    for pseudo_seq_pos in pocket:
                        similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))
                    
                    # normalize
                    similarity = similarity / float(len(pocket))
                    
                    print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                    
                    normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)
               

            print "creating empty kernel"
            kernel = shogun_factory.create_kernel(data.examples, param)
            
            print "setting normalizer"
            kernel.set_normalizer(normalizer)

            print "training SVM for pocket", pocket
            svm = self._train_single_svm(param, kernel, lab)

            classifiers.append(svm)
        
        
        print "done obtaining weak learners"
            
        
        # save additional info
        #self.additional_information["svm_objective"] = svm.get_objective()
        #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
        #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
        
        #print self.additional_information 
        


        ##################################################
        # combine weak learners for each task
        ##################################################
        
        
        # set constants
        
        some = 0.9
        import cvxmod
        
        
        # wrap up predictors
        svms = {}
            
        # use a reference to the same svm several times
        for task_name in train_boosting.keys():
            
            instances = train_boosting[task_name]
            
            N = len(instances)
            F = len(pockets)
            
            examples = [inst.example for inst in instances]
            labels = [inst.label for inst in instances]
            
            # dim = (F x N)
            out = cvxmod.zeros((N,F))
            
            for i in xrange(F):
                svm = classifiers[i]
                tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

                out[:,i] = numpy.sign(tmp_out)
                #out[:,i] = tmp_out
            

            #TODO: fix
            helper.save("/tmp/out_sparse", (out,labels))
            pdb.set_trace()
            
            weights = solve_boosting(out, labels, some, solver="mosek")
            
            
            
            svms[task_name] = (data.name_to_id(task_name), svm)

        
        return svms