Example #1
0
def get_simulation_data(simulation_name, simulation_parameters,
                        test_set_size=4000, validation_set_size=3200):
    """Run a named simulation and split its output into train/valid/test sets.

    Parameters
    ----------
    simulation_name : str
        Passed to ``get_simulation_function``; also decides which keys of
        ``simulation_parameters`` hold the motif name(s).
    simulation_parameters : dict
        Keyword arguments forwarded to the simulation function.
    test_set_size, validation_set_size
        Forwarded as ``test_size`` to the first and second
        ``train_test_split`` call respectively.

    Returns
    -------
    Data or None
        ``Data(X_train, X_valid, X_test, y_train, y_valid, y_test,
        motif_names)``; ``None`` when the simulation function raised.
    """
    import warnings

    simulation_function = get_simulation_function(simulation_name)
    try:
        sequences, y = simulation_function(**simulation_parameters)
    except Exception as e:
        # Keep the best-effort contract (callers get None), but do not hide
        # the reason the simulation failed.
        warnings.warn("simulation %r failed with parameters %r: %s"
                      % (simulation_name, simulation_parameters, e))
        return None

    if simulation_name == "simulate_heterodimer_grammar":
        motif_names = [simulation_parameters["motif1"],
                       simulation_parameters["motif2"]]
    elif simulation_name == "simulate_multi_motif_embedding":
        motif_names = simulation_parameters["motif_names"]
    else:
        motif_names = [simulation_parameters["motif_name"]]

    # Carve off the test set first, then take the validation set out of
    # what remains.
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_set_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=validation_set_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)

    return Data(X_train, X_valid, X_test, y_train, y_valid, y_test, motif_names)
def test_thresholded_scorers():
    """Check that the threshold-based scorers agree with the raw metrics."""
    roc_auc_scorer = SCORERS['roc_auc']

    # Binary problem, estimator exposing decision_function and predict_proba.
    features, labels = make_blobs(random_state=0, centers=2)
    X_tr, X_te, y_tr, y_te = train_test_split(features, labels, random_state=0)
    logreg = LogisticRegression(random_state=0)
    logreg.fit(X_tr, y_tr)
    scorer_auc = roc_auc_scorer(logreg, X_te, y_te)
    auc_from_decision = roc_auc_score(y_te, logreg.decision_function(X_te))
    auc_from_proba = roc_auc_score(y_te, logreg.predict_proba(X_te)[:, 1])
    assert_almost_equal(scorer_auc, auc_from_decision)
    assert_almost_equal(scorer_auc, auc_from_proba)

    # The log_loss scorer is the negated metric.
    scorer_logloss = SCORERS['log_loss'](logreg, X_te, y_te)
    metric_logloss = log_loss(y_te, logreg.predict_proba(X_te))
    assert_almost_equal(-scorer_logloss, metric_logloss)

    # Same check for an estimator without decision_function.
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_tr, y_tr)
    scorer_auc = roc_auc_scorer(tree_clf, X_te, y_te)
    auc_from_proba = roc_auc_score(y_te, tree_clf.predict_proba(X_te)[:, 1])
    assert_almost_equal(scorer_auc, auc_from_proba)

    # More than two classes must raise.
    features, labels = make_blobs(random_state=0, centers=3)
    X_tr, X_te, y_tr, y_te = train_test_split(features, labels, random_state=0)
    tree_clf.fit(X_tr, y_tr)
    assert_raises(ValueError, roc_auc_scorer, tree_clf, X_te, y_te)
Example #3
0
def xgb_semi_supervised(trainX,trainY,X_unlabeled,Y_unlabeled):
    """Spread labels over labelled + unlabelled rows, then hand off to xgb_model.

    Two extra feature columns are appended to each input matrix from the
    module-level lists ``word_dist_list`` / ``time_dist_list`` (labelled) and
    ``word_dist_list_unlabeled`` / ``time_dist_list_unlabeled`` (unlabelled).
    Nothing is returned; the result is whatever ``xgb_model`` does.
    """
    def _append_columns(matrix, columns):
        # Each list becomes one extra column on the right of the matrix.
        n_rows = matrix.shape[0]
        for column in columns:
            matrix = np.hstack((matrix, np.array(column).reshape(n_rows, 1)))
        return matrix

    trainX = _append_columns(trainX, (word_dist_list, time_dist_list))
    X_unlabeled = _append_columns(
        X_unlabeled, (word_dist_list_unlabeled, time_dist_list_unlabeled))

    # Keep only the 15% "train" side of the unlabelled pool.
    X_unlabeled, _, Y_unlabeled, _ = train_test_split(
        X_unlabeled, Y_unlabeled, test_size=0.85, random_state=20)

    x_train, x_test, y_train, y_test = train_test_split(
        trainX, trainY, test_size=0.25, random_state=20)

    # Stack the labelled training rows on top of the unlabelled rows.
    stacked_x = sparse.csr_matrix(np.concatenate((x_train, X_unlabeled), axis=0))
    stacked_y = np.concatenate((y_train, Y_unlabeled), axis=0)

    spreader = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
    spreader.fit(stacked_x.toarray(), stacked_y)
    spread_labels = spreader.transduction_

    xgb_model(stacked_x, x_test, spread_labels, y_test)
Example #4
0
def splitDataset(data, random_seed):
    '''
    Split a dataframe into training, validation and test sets (60/20/20),
    using random_seed for both splits so the result is repeatable.

    The three parts are collected in a dictionary with keys train, valid and
    test, which is passed to extractFeatures; that call's result is returned.
    '''
    col_headers = list(data.columns.values)

    # Not used further in this function; the removals still raise ValueError
    # when either column is absent from the frame.
    feature_cols = copy.deepcopy(col_headers)
    feature_cols.remove('Sample')
    feature_cols.remove('Diagnosis')

    def _as_frame(part):
        # Re-wrap a split result as a DataFrame carrying the original headers.
        frame = pd.DataFrame(part)
        frame.columns = col_headers
        return frame

    # 20% test, then 25% of the remaining 80% (= 20% overall) for validation.
    remainder, test = train_test_split(data, test_size=0.2, random_state=random_seed)
    remainder = _as_frame(remainder)
    test = _as_frame(test)
    train, validate = train_test_split(remainder, test_size=0.25, random_state=random_seed)
    train = _as_frame(train)
    validate = _as_frame(validate)

    return extractFeatures({'train': train, 'valid': validate, 'test': test})
Example #5
0
def processMethod3(userid, featureCondition=1, classificationCondition=1, offsetFeatureOn=False):
    """ User-i Device-j hack in User-i Device-k Model: iphone6plus hack iphone5

    A model is trained on the training part of the user's iPhone 5 data and
    evaluated on the held-out part of the same user's iPhone 6 Plus data.

    Returns
    -------
    The result of ``classify`` (documented by the original author as a float
    error rate).
    """
    device_iphone6plus = 1
    device_iphone5 = 2

    ip5_data, ip5_label = splitMomentDataByFeatureAndLabel(
        userid, device_iphone5, featureCondition, classificationCondition,
        offsetFeatureOn=offsetFeatureOn)
    ip6_data, ip6_label = splitMomentDataByFeatureAndLabel(
        userid, device_iphone6plus, featureCondition, classificationCondition,
        offsetFeatureOn=offsetFeatureOn)

    # Same test size as method1; only the iPhone 5 training part and the
    # iPhone 6 Plus test part are used afterwards.
    ip5_train_x, _, ip5_train_y, _ = train_test_split(
        ip5_data, ip5_label, test_size=my_test_size, random_state=my_random_state)
    _, ip6_test_x, _, ip6_test_y = train_test_split(
        ip6_data, ip6_label, test_size=my_test_size, random_state=my_random_state)

    return classify(ip5_train_x, ip5_train_y, ip6_test_x, ip6_test_y,
                    kernel=my_kernel, max_iter=my_max_iteration)
Example #6
0
def dump_data_2_pickle(gsr_file, pickleFile):
    """
    Dump the txt gsr file data into a pickle file.

    The documents are split 70/15/15 into train/valid/test sets, vocabularies
    are built from the training set only, and six objects are pickled in
    order: train_set, valid_set, test_set, word2id, pop2id, type2id.

    :type gsr_file: string
    :param gsr_file: path to gsr file, default: gsr_article/gsr_spanish.txt

    :type pickleFile: string
    :param pickleFile: path to pickle file, default: ../data/dataset.pkl
    """
    # generate docs and gsrs
    docs, gsrs = generate_docs(gsr_file)
    # Materialize the pairs: the splitter needs a sized, indexable sequence,
    # and zip() is lazy on Python 3.
    dataset = list(zip(docs, gsrs))
    train_set, test_set = train_test_split(dataset,
            test_size=0.3,
            random_state=10)
    # Halve the held-out 30% into validation and test.
    valid_set, test_set = train_test_split(test_set,
            test_size=0.5,
            random_state=11)

    # Vocabulary with the reserved tokens first; unknown words map to "UNK".
    word2id = {"UNK": 0, "<S>": 1, "</S>": 2, "<PAD>": 3}
    pop2id = {}
    type2id = {}

    # Ids are assigned in first-seen order, so the next free id is always the
    # current size of the mapping.
    for doc, gsr in train_set:
        for sen in doc:
            for token in sen:
                word2id.setdefault(token, len(word2id))
        pop2id.setdefault(gsr["population"], len(pop2id))
        type2id.setdefault(gsr["eventType"], len(type2id))

    train_set = transform_set(train_set, word2id, pop2id, type2id)
    valid_set = transform_set(valid_set, word2id, pop2id, type2id)
    test_set = transform_set(test_set, word2id, pop2id, type2id)

    # Pickle streams are binary data; text mode can corrupt them on platforms
    # that translate newlines.
    with open(pickleFile, 'wb') as pf:
        cPickle.dump(train_set, pf)
        cPickle.dump(valid_set, pf)
        cPickle.dump(test_set, pf)
        cPickle.dump(word2id, pf)
        cPickle.dump(pop2id, pf)
        cPickle.dump(type2id, pf)
 def tribunalTrain(data,predict,tribunal,split=.2,stat=False,statLis=None):
     #data for testing the tribunal performance, not in actual judge training
     dat_train, dat_test, lab_train, lab_test = train_test_split(data,predict, test_size=split)
     verdict = []
      
     print 'Tribunal in session'
     
     for judge in tribunal:
         jdat_train, jdat_test, jlab_train, jlab_test = train_test_split(dat_train,lab_train, test_size=split)
         judge.fit(jdat_train, jlab_train)
         print 'judge trained'
 
     for d in dat_test:
         votes = []
         for judge in tribunal:
             v = judge.predict(d)
             votes.append(v)
         decision = stats.mode(votes,axis=None)
         verdict.append(decision[0])
     npVerdict = np.array(verdict)
     
     if stat == False:        
         svmDesc(npVerdict,lab_test,title='Tribunal Confusion Matrix')
     else:
         jac = jaccard_similarity_score(npVerdict,lab_test)
         statLis.append(jac)
Example #8
0
def Adaboost(TrainData,TestData):
    features=['Time','Season','Hour','Minute','District']

    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)

    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])

        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin  time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])

        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)


        #writing to file
        """Category_new=[]
def tuning_l2_penalty(out_file, featurizers = None):
    # featurizers for blog/blog, twitter+wiki/blog and twitter+wiki/twitter+wiki respectively
    if not featurizers:
        featurizers = [feat4, feat5, feat4]
    # used to weigh L-2 penalty
    c_vals = [ v / 100.0 for v in range(50, 110, 10)]
    # data splits used
    b_train, b_test = train_test_split(blog_80, test_size = 0.1, random_state = 1)
    tw_train, tw_test = train_test_split(tw, test_size = 0.1, random_state = 1)
    # count sizes only once
    n_btest = float(len(b_test))
    n_b80 = float(len(blog_80))
    n_twtest = float(len(tw_test))

    for c_val in c_vals:
        print "Running l-2 tunning for C:%.2f" % c_val
        # Using split validation, as otherwise too slow
        make_model = lambda: Models.LogisticRegression(C = c_val)
        blog_errors = error_analyze(make_model, b_train, b_test, featurizers[0])
        twb_errors = error_analyze(make_model, tw, blog_80, featurizers[1])
        tw_errors = error_analyze(make_model, tw_train, tw_test, featurizers[2])

        blog_acc = 1 - len(blog_errors["error_indices"]) / n_btest
        twb_acc = 1 - len(twb_errors['error_indices']) / n_b80
        tw_acc = 1 - len(tw_errors['error_indices']) / n_twtest
        # write to file provided
        out_file.write("C=%f\n" % c_val)
        out_file.write("b=%f, twb=%f, tw=%f\n\n" % (blog_acc, twb_acc, tw_acc))
Example #10
0
def split_train_test_with_common_vocabulary(sparse_data: dict, test_size: float):
    """Split every entry of sparse_data into train and test parts.

    Both result dicts share the same "unigrams" object. "coordinates" and
    every other key except "counts" are split with one fixed seed.
    result["counts"][feature] holds the per-column sums of that feature's
    part as a flat list.

    Returns the pair (train, test).
    """
    # TODO: enable a random seed, e.g. random.randint(0, 2 ** 32)
    seed = 1

    def _split(values):
        return cross_validation.train_test_split(values, test_size=test_size,
                                                 random_state=seed)

    def _column_totals(matrix):
        # sum(axis=0) yields a 1 x n matrix; flatten it to a plain list.
        return np.asarray(matrix.sum(axis=0)).flatten().tolist()

    vocabulary = sparse_data["unigrams"]
    train = {"unigrams": vocabulary, "counts": {}}
    test = {"unigrams": vocabulary, "counts": {}}

    coordinates_train, coordinates_test = _split(sparse_data["coordinates"])
    train["coordinates"] = coordinates_train
    test["coordinates"] = coordinates_test

    reserved_keys = ["coordinates", "counts", "unigrams"]
    for feature in sparse_data.keys():
        if feature in reserved_keys:
            continue
        sparse_train, sparse_test = _split(sparse_data[feature])
        train[feature] = sparse_train
        test[feature] = sparse_test
        train["counts"][feature] = _column_totals(sparse_train)
        test["counts"][feature] = _column_totals(sparse_test)

    return train, test
def test_train_test_split():
    """Exercise train_test_split on dense, sparse, list and n-d inputs."""
    dense = np.arange(100).reshape((10, 10))
    sparse_version = coo_matrix(dense)
    targets = np.arange(10)

    # Simple half/half split: equal sizes, rows stay aligned with targets.
    X_train, X_test, y_train, y_test = cval.train_test_split(
        dense, targets, test_size=None, train_size=.5)
    assert_equal(len(y_test), len(y_train))
    assert_array_equal(X_train[:, 0], y_train * 10)
    assert_array_equal(X_test[:, 0], y_test * 10)

    # Conversion of lists to arrays (deprecated?)
    parts = cval.train_test_split(dense, sparse_version, targets.tolist(),
                                  allow_lists=False)
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = parts
    assert_array_equal(X_train, X_s_train.toarray())
    assert_array_equal(X_test, X_s_test.toarray())

    # By default lists stay lists.
    parts = cval.train_test_split(dense, sparse_version, targets.tolist())
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = parts
    assert_true(isinstance(y_train, list))
    assert_true(isinstance(y_test, list))

    # n-d arrays are split along the first axis only.
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    parts = cval.train_test_split(X_4d, y_3d)
    expected_shapes = [(7, 5, 3, 2), (3, 5, 3, 2), (7, 7, 11), (3, 7, 11)]
    for position, shape in enumerate(expected_shapes):
        assert_equal(parts[position].shape, shape)
Example #12
0
def load_data():
    '''
    Loads the data, turns into word2vec representation, and splits
    into training, validation, and testing sets with ratio 8:1:1.

    The vectorized features and labels are also pickled to
    ../data/clean_data.pkl and ../data/clean_label.pkl. The window size
    comes from the module-level name window_size.

    Returns X_train, X_val, X_test, y_train, y_val, y_test.
    '''
    trainingDataFile = '../data/traindata.txt'
    trainingPosFile = '../data/pos_Embedding.txt'
    trainingLabelFile = '../data/trainlabel.txt'
    wordToVecDictFile = '../data/glove/glove.6B.50d.txt'
    print('Vectorizing the features and labels...')
    start_time = timeit.default_timer()
    X,Y = word2vec.createVecFeatsLabels(trainingDataFile,trainingPosFile,trainingLabelFile,wordToVecDictFile,window_size)
    end_time = timeit.default_timer()
    print('Pickling the vectorization files')
    # Context managers close the files even if pickling raises.
    with open('../data/clean_data.pkl','wb') as clean_data:
        pickle.dump(X, clean_data)
    with open('../data/clean_label.pkl', 'wb') as clean_label:
        pickle.dump(Y, clean_label)
    print(('The vectorization ran for %.2fm' % ((end_time - start_time) / 60.)))
    print('Splitting into training, validation, and testing sets ...')
    # 80% train, then the remaining 20% is halved into validation and test.
    X_train, X_rest, y_train, y_rest = train_test_split(X, Y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_rest,y_rest, test_size=0.5, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test
Example #13
0
def load_dataset(path_id="", folder="", use_float_32=False, test_ratio=0.3, valid_ratio=0.1):	
#def load_dataset(path_id="", use_float_32=False, test_ratio=0.2, valid_ratio=0.1):
	# reading full dataset
	features_path = "data/%s/features%s.npy"%(folder, path_id)
	labels_path = "data/%s/labels%s.npy"%(folder, path_id)
	

	features = np.load(features_path)
	if use_float_32:
		features = features.astype(np.float32)
	labels = np.load(labels_path)
	
	# splitting data
	train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(features, labels, test_size=test_ratio, random_state=89677)
	#train_set_x = features[:2500]
	#train_set_y = labels[:2500]
	
	#test_set_x = features[2500:]
	#test_set_y = labels[2500:]
	test_set_x = theano.shared(value=test_set_x, name='test_set_x', borrow=True)
	test_set_y = theano.shared(value=np.array(test_set_y), name='test_set_y', borrow=True)
	
	# split train set into validation set
	train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(train_set_x, train_set_y, test_size=valid_ratio, random_state=89677)
	
	print train_set_x.shape, valid_set_x.shape, test_set_x.get_value(borrow=True).shape
	
	train_set_x = theano.shared(value=train_set_x, name='train_set_x', borrow=True)
	train_set_y = theano.shared(value=np.array(train_set_y), name='train_set_y', borrow=True)
	
	valid_set_x = theano.shared(value=valid_set_x, name='valid_set_x', borrow=True)
	valid_set_y = theano.shared(value=np.array(valid_set_y), name='valid_set_y', borrow=True)
	
	return ((train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y))	
Example #14
0
def load_data_sets(input_data, labels, split_only=True, valid_set=False):
    """Split the data and bundle the parts as DataSet attributes.

    With valid_set, 20% is held out for testing and 5% of the rest for
    validation; otherwise 30% is held out for testing and .validation is None.
    Unless split_only is set, the unsplit data is also attached as .all.
    """
    class DataSets(object):
        pass

    bundle = DataSets()

    print("\nSplitting to Train & Test sets for Finetuning")

    held_out_fraction = 0.2 if valid_set else 0.3
    train_x, test_x, train_y, test_y = train_test_split(
        input_data, labels, test_size=held_out_fraction)

    if valid_set:
        train_x, valid_x, train_y, valid_y = train_test_split(
            train_x, train_y, test_size=0.05)
        bundle.validation = DataSet(valid_x, valid_y)
    else:
        bundle.validation = None

    bundle.train = DataSet(train_x, train_y)
    bundle.test = DataSet(test_x, test_y)

    if not split_only:
        bundle.all = DataSet(input_data, labels)

    return bundle
Example #15
0
def get_best_k_model(model, max_k, x, y):
    """Pick the SelectKBest k with the best test score and report its metrics.

    Input
    model: scikit-learn model
    max_k: maximum k-value to try (inclusive, capped at the column count of x)
    x: independent variables (must expose .columns)
    y: dependent variable

    Output
    best_k: number of selected features that gave the best test score
    train_score, test_score, train_mse, test_mse: the four values returned by
    get_model_values for that k
    """
    def _split_top_k(k):
        # chi2-select the k best columns, then make a fixed 80/20 split.
        selected = fs.SelectKBest(fs.chi2, k = k).fit_transform(x, y)
        return cv.train_test_split(selected, y, test_size = 0.2, random_state = 0)

    k_vals = list(range(1, min(max_k, len(x.columns)) + 1))

    test_scores = []
    for k_val in k_vals:
        x_train, x_test, y_train, y_test = _split_top_k(k_val)
        fitted = model.fit(x_train, y_train)
        test_scores.append(fitted.score(x_test, y_test))

    best_k = k_vals[np.argmax(test_scores)]
    x_train, x_test, y_train, y_test = _split_top_k(best_k)

    train_score, test_score, train_mse, test_mse = get_model_values(
        model, x_train, y_train, x_test, y_test)

    return best_k, train_score, test_score, train_mse, test_mse
Example #16
0
def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print "[Start]", params

    thresholds, scores = [], []

    for i in range(n_repeat):
        if verbose > 0:
            print "Fold", i

        _, X_fold, _, y_fold, _, w_fold = train_test_split(X, y, w, train_size=0.5, random_state=i)
        X_pred = load_predictions("stack/*-fold%d.npy" % i)
        X_fold = np.hstack((X_fold, X_pred))

        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(X_fold, y_fold, w_fold, train_size=0.33, random_state=i)
        X_train = np.asfortranarray(X_train, dtype=np.float32)

        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)
        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except:
            clf = clf.fit(X_train, y_train)

        threshold, score, _ = find_threshold(clf, X_valid, y_valid, w_valid)

        thresholds.append(threshold)
        scores.append(score)

    if verbose > 0:
        print "[End]", params, np.mean(thresholds), np.mean(scores)

    return (np.mean(scores), np.mean(thresholds), params, thresholds, scores)
Example #17
0
def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'


    '''
Example #18
0
def split(data, size):
    """Split each 'LOG BB' class separately, then stack the parts.

    Rows are grouped by the 'LOG BB' column (values 0.0 and 1.0). The last
    column is taken as the target and all preceding columns as features.
    The positive group is truncated to at most as many rows as the negative
    group before splitting.

    Returns x_train, x_test, y_train, y_test with positive rows first.
    """
    by_class = data.groupby('LOG BB')
    negatives = by_class.get_group(0.0)
    positives = by_class.get_group(1.0)
    n_columns = negatives.shape[1]
    n_negatives = negatives.shape[0]

    def _features_and_target(frame, rows):
        features = frame.iloc[rows, 0:n_columns-1].values
        target = frame.iloc[rows, n_columns-1:n_columns].values
        return features, target

    x_pos, y_pos = _features_and_target(positives, slice(None, n_negatives))
    x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(
        x_pos, y_pos, test_size=size, random_state=100)

    x_neg, y_neg = _features_and_target(negatives, slice(None))
    x_neg_train, x_neg_test, y_neg_train, y_neg_test = train_test_split(
        x_neg, y_neg, test_size=size, random_state=100)

    x_train = np.concatenate((x_pos_train, x_neg_train), axis=0)
    y_train = np.concatenate((y_pos_train, y_neg_train), axis=0)
    x_test = np.concatenate((x_pos_test, x_neg_test), axis=0)
    y_test = np.concatenate((y_pos_test, y_neg_test), axis=0)

    return x_train, x_test, y_train, y_test
Example #19
0
def get_splitted_data_for_Cyst():
    """Load the cyst images and return mean-centred train/validation/test sets.

    20% of the data is held out; 20% of that held-out part becomes the final
    test set and the rest the validation set. The mean image of the training
    set is subtracted from all three sets.

    Returns X_train, T_train, X_validation, T_validation, X_test, T_test.
    """
    (all_images, image_class) = loadImages_for_Cyst()

    # Training set vs. held-out pool.
    train_images, pool_images, train_targets, pool_targets = \
        cross_validation.train_test_split(all_images, image_class, test_size=0.2)
    # Held-out pool -> validation set and final test set.
    valid_images, test_images, valid_targets, test_targets = \
        cross_validation.train_test_split(pool_images, pool_targets, test_size=0.2)

    # Normalize with the training mean only.
    mean_image = np.mean(train_images, axis=0)
    train_images -= mean_image
    valid_images -= mean_image
    test_images -= mean_image

    return (train_images, train_targets,
            valid_images, valid_targets,
            test_images, test_targets)
Example #20
0
def conv_demo():
    """Train pynn.ConvNet on the digits dataset and print its test accuracy."""
    digits = load_digits()
    pixels = digits['data']
    one_hot = LabelBinarizer().fit_transform(digits['target'])

    # Two successive 75/25 splits: test first, then validation.
    split_options = dict(test_size=0.25, random_state=RANDOM_STATE)
    X_fit, X_test, y_fit, y_test = train_test_split(pixels, one_hot, **split_options)
    X_train, X_valid, y_train, y_valid = train_test_split(X_fit, y_fit, **split_options)

    print("Building neural net to classify digits")
    image_shape = digits['images'][0].shape
    n_classes = one_hot.shape[1]
    conv_net = pynn.ConvNet(image_shape, 1, n_classes,
                            random_state=RANDOM_STATE)

    print("Training")
    conv_net.fit(X_train, y_train, X_valid, y_valid,
                 batch_size=20, n_epochs=20, learning_rate=0.05)

    y_pred = conv_net.predict(X_test)

    # Compare class indices rather than one-hot rows.
    accuracy = accuracy_score(y_test.argmax(1), y_pred.argmax(1))
    print("digits accuracy: {}%".format(accuracy * 100))
    def __init__(self, data, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False, fraction_test=0.01):
        if batch_range == None:
            raise Exception('the range is empty')
        if init_batchnum is None or init_batchnum not in batch_range:
            init_batchnum = batch_range[0]

        self.data_dir = None
        self.batch_range = batch_range
        self.curr_epoch = init_epoch
        self.curr_batchnum = init_batchnum
        self.dp_params = dp_params
        self.batch_meta = None
        self.data_dic = None
        self.test = test
        self.batch_idx = batch_range.index(init_batchnum)


        self.X = data[0]
        self.y = data[1]
        self.fraction_test = fraction_test
        if self.y is not None:
            print 'data is: {}, X shape {}, y shape {}'.format(len(data), self.X.shape,self.y.shape)
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=self.fraction_test,
                                                                                random_state=42)
        else:
            print 'data is: {}, X shape {}'.format(len(data), self.X.shape)
            self.X_train, self.X_test = train_test_split(self.X,
                                                                                test_size=self.fraction_test,
                                                                                random_state=42)
            self.y_train = np.array([0] * self.X_train.shape[0],dtype=np.float32)
            self.y_test = np.array([0] * self.X_test.shape[0],dtype=np.float32)
Example #22
0
def main():
    """Train, calibrate and apply an ExtraTrees pipeline, writing submission.csv.

    train.csv is split 80/20 into a fitting part and a calibration part. The
    fitting part is split 80/20 again into the data the pipeline is fitted on
    and a held-out part used to report log loss before and after calibration.
    """
    X, Y, encoder, scale = load_train_data('train.csv')
    estimators = 500
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    # Classifier setup. min_samples_split must be at least 2 in current
    # scikit-learn; a one-sample node cannot be split anyway, so this is
    # equivalent to the previous value of 1.
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=42, max_depth=55, min_samples_split=2)

    clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf)
    log.info('Fitting ExtraTrees pipeline')
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)
    # Calibrate the already-fitted pipeline using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Predict the competition test data with the calibrated classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)

    write_out_submission(sig_submission_probs, 'submission.csv')
Example #23
0
def getImages():
   """Load digit and letter images, project them with a shared PCA, and split.

   Returns two nested lists, digits first and letters second, each shaped
   [[train_data, train_classes], [test_data, test_classes]], using a 65%
   training share.
   """
   rawDigits = getImagesFromDir(digitsPath)
   rawLetters = getImagesFromDir(lettersPath)

   # Index 0 holds the image data (scaled here), index 1 the class labels.
   digitFeatures, digitClasses = skpre.scale(rawDigits[0]), rawDigits[1]
   letterFeatures, letterClasses = skpre.scale(rawLetters[0]), rawLetters[1]

   # PCA (dimensionality reduction) is computed over digits and letters
   # together, then applied to each set separately.
   allImages = list(digitFeatures) + list(letterFeatures)
   pca = computePCA(allImages)
   digitsProjected = pca.transform(digitFeatures)
   lettersProjected = pca.transform(letterFeatures)

   # Split into training and test parts.
   dTrain, dTest, dTrainClasses, dTestClasses = train_test_split(
      digitsProjected, digitClasses, train_size=0.65)
   lTrain, lTest, lTrainClasses, lTestClasses = train_test_split(
      lettersProjected, letterClasses, train_size=0.65)

   digitSets = [[dTrain, dTrainClasses], [dTest, dTestClasses]]
   letterSets = [[lTrain, lTrainClasses], [lTest, lTestClasses]]
   return digitSets, letterSets
 def split_dataset(index, random_state, test_ratio=0.2, valid_ratio=0.2):
     """Split `index` into train/valid/test lists; the ratios refer to the whole set.

     Returns a dict with keys 'train', 'valid' and 'test'.
     """
     all_ix = list(index)
     remaining, ix_test = train_test_split(
         all_ix, test_size=test_ratio, random_state=random_state)
     # valid_ratio is a share of the full set, so rescale it to the part
     # left after removing the test rows.
     valid_share = valid_ratio / (1 - test_ratio)
     ix_train, ix_valid = train_test_split(
         remaining, test_size=valid_share, random_state=random_state)
     return {'train': ix_train, 'valid': ix_valid, 'test': ix_test}
def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2):
    '''Split a list of image files up into training, testing and validation sets.

    The file names are read from the pickled list img_dir + 'imgs.list' when
    that file exists, otherwise from the *.jpg files matching img_dir + '*.jpg'.
    img_dir is used as a plain string prefix, so it should end with a path
    separator.

    Returns (trainfiles, valfiles, testfiles), each a list of base file names.
    '''
    list_path = img_dir + 'imgs.list'
    if os.path.isfile(list_path):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with trusted image directories.
        with open(list_path, 'rb') as list_file:
            baseimgfilenames = pickle.load(list_file)
    else:
        imgfilenames = glob.glob(img_dir + '*.jpg')
        baseimgfilenames = [os.path.basename(f) for f in imgfilenames]

    # First separate the validation set from the combined train+test pool.
    train_test_prop = train_set_proportion + test_set_proportion
    train, val = train_test_split(np.arange(len(baseimgfilenames)),
                                  train_size=train_test_prop,
                                  test_size=val_set_proportion,
                                  random_state=1)

    # Then split the pool; proportions are rescaled relative to the pool.
    train, test = train_test_split(train,
                                   train_size=train_set_proportion/train_test_prop,
                                   test_size=test_set_proportion/train_test_prop,
                                   random_state=1)

    trainfiles = [baseimgfilenames[i] for i in train]
    valfiles = [baseimgfilenames[i] for i in val]
    testfiles = [baseimgfilenames[i] for i in test]

    return trainfiles, valfiles,testfiles
def main(unused_argv):
  """Train two iris DNN classifiers and print their test accuracies.

  The first classifier is fit for 2000 steps without any monitor; the second
  is fit with a ValidationMonitor (early_stopping_rounds=200) that watches a
  held-out validation split.
  """
  iris = datasets.load_iris()

  # First split: 20% test, the rest is shared by training and validation.
  x_fit, x_test, y_fit, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)
  # Second split: 20% of the remainder becomes the validation set.
  x_train, x_val, y_train, y_val = train_test_split(
      x_fit, y_fit, test_size=0.2, random_state=42)

  val_monitor = learn.monitors.ValidationMonitor(
      x_val, y_val, early_stopping_rounds=200)

  # Baseline: fixed number of steps, no validation monitor attached.
  classifier1 = learn.DNNClassifier(
      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/')
  classifier1.fit(x=x_train, y=y_train, steps=2000)
  predictions1 = classifier1.predict(x_test)
  score1 = metrics.accuracy_score(y_test, predictions1)

  # Monitored run: checkpoints are saved every second so the monitor has
  # fresh checkpoints to pick up.
  frequent_checkpoints = tf.contrib.learn.RunConfig(save_checkpoints_secs=1)
  classifier2 = learn.DNNClassifier(
      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/',
      config=frequent_checkpoints)
  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
  predictions2 = classifier2.predict(x_test)
  score2 = metrics.accuracy_score(y_test, predictions2)

  # Report both scores and whether the monitored run did better.
  print('score1: ', score1)
  print('score2: ', score2)
  print('score2 > score1: ', score2 > score1)
def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2):
    '''Divide the .jpg files found under img_dir into train, validation and test lists.

    File names are the base names of everything matching ``img_dir + '*.jpg'``.
    If the train and test proportions leave nothing over (sum not below 1),
    the validation list is empty.

    Returns ``(trainfiles, valfiles, testfiles)``.
    '''
    baseimgfilenames = [os.path.basename(path)
                        for path in glob.glob(img_dir + '*.jpg')]

    if train_set_proportion + test_set_proportion < 1:
        # Carve the validation share off first.
        pool, val_idx = train_test_split(
            np.arange(len(baseimgfilenames)),
            train_size=train_set_proportion+test_set_proportion,
            test_size=val_set_proportion,
            random_state=1)
    else:
        pool = np.arange(len(baseimgfilenames))
        val_idx = []

    # Proportions are rescaled so they are fractions of the train+test pool.
    train_test_prop = train_set_proportion + test_set_proportion
    train_idx, test_idx = train_test_split(
        pool,
        train_size=train_set_proportion/train_test_prop,
        test_size=test_set_proportion/train_test_prop,
        random_state=1)

    def names_at(positions):
        return [baseimgfilenames[i] for i in positions]

    return names_at(train_idx), names_at(val_idx), names_at(test_idx)
Example #28
0
def train_lsvr():
    """Fit an SVR on the systole features, then refit it on the diastole
    features, printing the validation CRPS after each fit.

    Each feature set is split with train_test_split (default sizes, no fixed
    seed); targets are columns 0 and 1 of 'data/y_train.npy'.
    """
    train_sys = np.load('fc2_train_sys.npy')
    test_sys = np.load('fc2_test_sys.npy')  # loaded but not used below

    y = np.load('data/y_train.npy')
    from sklearn import svm
    from sklearn.ensemble import RandomForestRegressor  # not used below
    lsvr = svm.SVR(C=0.1)

    def validation_crps(features, targets):
        # Hold out a validation part, fit on the rest and score the held-out
        # predictions after converting both sides with real_to_cdf.
        fit_x, val_x, fit_y, val_y = train_test_split(features, targets)
        lsvr.fit(fit_x, fit_y)
        predicted = lsvr.predict(val_x)
        return crps(real_to_cdf(val_y), real_to_cdf(predicted))

    print('CRPS(val sys) = {0}'.format(validation_crps(train_sys, y[:, 0])))

    train_dia = np.load('fc2_train_dia.npy')
    test_dia = np.load('fc2_test_dia.npy')  # loaded but not used below

    print('CRPS(val dia) = {0}'.format(validation_crps(train_dia, y[:, 1])))
def split_data(x_train, y_train):
    """
    Split the given samples into training, cross-validation and test parts.

    30% of the data is held out first (random_state=53); that held-out part
    is then halved (random_state=41) into cross-validation and test data.

    INPUTS:
    x_train = Features
    y_train = Labels matching x_train

    OUTPUTS (in this order):
    training features, cross-validation features, test features,
    training labels, cross-validation labels, test labels
    """
    fit_x, holdout_x, fit_y, holdout_y = cross_val.train_test_split(
        x_train, y_train, test_size=0.3, random_state=53)
    cv_x, test_x, cv_y, test_y = cross_val.train_test_split(
        holdout_x, holdout_y, test_size=0.5, random_state=41)
    return fit_x, cv_x, test_x, fit_y, cv_y, test_y
Example #30
0
def iris_demo():
    """Train pynn.ArtificialNeuralNet on the iris data and print its test accuracy.

    Labels are binarized with LabelBinarizer; 25% of the data is held out for
    testing and 25% of the remainder for validation, both with RANDOM_STATE.
    """
    iris = load_iris()
    features = iris['data']
    one_hot = LabelBinarizer().fit_transform(iris['target'])

    # Test split first, then a validation split from what is left.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, one_hot, test_size=0.25, random_state=RANDOM_STATE)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=RANDOM_STATE)

    print("Building logistic regression classifier to classify iris data")
    # Layer sizes: input width, one hidden layer of 20, output width.
    layer_sizes = [X_train.shape[1], 20, y_train.shape[1]]
    nn = pynn.ArtificialNeuralNet(layer_sizes)
    print("Training")
    nn.fit(X_train, y_train, X_valid, y_valid,
           batch_size=20, n_epochs=20, learning_rate=0.05,
           random_state=RANDOM_STATE)

    y_pred = nn.predict(X_test)

    # Compare the arg-max class of the targets with that of the predictions.
    accuracy = accuracy_score(y_test.argmax(1), y_pred.argmax(1))
    print("iris accuracy: {}%".format(accuracy * 100))
Example #31
0
# Load every .ppm image; the file name is stored as that image's label.
x_dataset = []
Y = []
for filename in gb.glob('datasets/imagenes/*.ppm'):
    img = misc.imread(filename)
    x_dataset.append(img)
    Y.append(filename)

## 2) preprocess
X = []
for img in x_dataset:
    # 2.1. Convert to grayscale
    gray_img = color.rgb2gray(img)

    # 2.2. Equalize the image histogram
    eq = exposure.equalize_hist(gray_img)
    # 2.3. Apply a Gaussian filter
    blur = gaussian(eq, sigma=1)
    # Binarize the image: 1 where a pixel is above the mean, 0 elsewhere
    bin_img = (blur > blur.mean()).astype(int)
    plot_image(blur)
    plot_image(bin_img)
    # 2.4. Flatten the image into a 1-D feature vector
    X.append(np.reshape(bin_img, [-1]))
    # The leftover ipdb.set_trace() breakpoint was removed here: it stopped
    # the script in the debugger once per image.

## 3) Split into training and test sets
xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                Y,
                                                test_size=0.2,
                                                random_state=42)
Example #32
0
@author: hp
"""

import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, svm
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.txt')

# '?' entries are replaced by a large negative number, and the 'id' column
# is dropped from the frame.
df.replace('?', -99999, inplace=True)
df.drop(['id'], axis=1, inplace=True)

#print( df.head() )

# Features are every column except 'class'; 'class' is the target.
X = np.array( df.drop(['class'], axis=1) )
y = np.array( df['class'] )

X_train , X_test, y_train, y_test = cross_validation.train_test_split(X,y, test_size=0.2 )

# SVC takes no n_jobs argument; passing one raises TypeError.
clf = svm.SVC()
clf.fit( X_train , y_train )

accuracy = clf.score(X_test, y_test)

#print( accuracy )

# A single sample with 9 feature values: predict() expects shape
# (n_samples, n_features), i.e. (1, 9) here.
example_measures = np.array([2,7,10,10,7,10,4,9,4])
example_measures = example_measures.reshape(1, -1)
predection = clf.predict(example_measures)

print( predection )
# Dependent variable: the values of the column at position 3 of `dataset`.
# (`dataset` and `inpen` are defined before this excerpt.)
depen = dataset.iloc[:, 3].values
# The triple-quoted blocks below are bare string literals holding disabled
# code (imputation, encoding, scaling); they are evaluated and discarded.
"""
#Taking Car Of Missing Data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) #Grab the null values
imputer = imputer.fit(inpen[:, 1:3]) #Get the mean of all the other values in each of these columns
inpen[:, 1:3] = imputer.transform(inpen[:, 1:3]) # Set the null vals to the calculated mean

#Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoder_country = LabelEncoder()
inpen[:, 0] = encoder_country.fit_transform(inpen[:,0])
onehotencoder = OneHotEncoder(categorical_features= [0])
inpen = onehotencoder.fit_transform(inpen).toarray()
encoder_purchased = LabelEncoder()
depen = encoder_purchased.fit_transform(depen)
"""

#Splitting The Dataset Into The Training And Testing Set
from sklearn.cross_validation import train_test_split

# 20% of the rows go to the test set; no random_state is given, so the split
# differs from run to run.
inpen_train, inpen_test, depen_train, depen_test = train_test_split(
    inpen, depen, test_size=0.2)
"""
#Feature Scaling
from sklearn.preprocessing import StandardScaler
stdScale_inpen = StandardScaler()
inpen_train = stdScale_inpen.fit_transform(inpen_train) #We need to fit the training set before we transform it
inpen_test = stdScale_inpen.transform(inpen_test) #We do not need to fit the test set before we transform it because it's already fitted to the training set
"""
Example #34
0
                    learning_rate=0.01,
                    n_estimators=550,
                    subsample=0.5,
                    colsample_bytree=0.5,
                    seed=0)
clf.fit(train_x, train_y)
test_y = clf.predict_proba(test_x)[:, 1]
makesubmission(test_y)

print 'done'
# sn = model2.degit_network(units=args.units,gpu=args.gpu)
sn = model.shoot_network(units=args.units, gpu=args.gpu)

if args.train > 0:
    print 'predict validation test set'
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_x, train_y, test_size=0.2, random_state=0)

    clf = svm.SVC()
    clf.fit(X_train, y_train)
    pred = clf.fit(X_test)

    print 'svm logloss', sn.logloss(y_test, pred)

    sn.fit(X_train,
           y_train,
           n_epoch=args.epoch,
           batchsize=args.batchsize,
           save=False)
    pred = sn.predict(X_test)
    print 'logloss of test set:', sn.logloss(y_test, pred)
Example #35
0
# Binary target: 0 for Type 1-4, 1 for Type 5-7.
glass_data.loc[glass_data.Type.between(1, 4), 'binary'] = 0
glass_data.loc[glass_data.Type.between(5, 7), 'binary'] = 1
print glass_data.head()
#print glass_data[(glass_data.Type > 2) & (glass_data.Type < 7)]

#part 2
# NOTE(review): 'Type' is one of the features, while the target `binary` is
# derived from Type above - this looks like target leakage; confirm intent.
X = glass_data[[
    'Ref Index', 'Sodium', 'Mag', 'Alum', 'Silicon', 'Potas', 'Calcium',
    'Barium', 'Iron', 'Type'
]]
y = glass_data.binary

print X.shape
print y.shape
# 30% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

#part 3
#fit the model
LR = LogisticRegression()
LR.fit(X_train, y_train)
# B1 is the coefficient of the first feature column only ('Ref Index').
B1 = LR.coef_[0][0]
B0 = LR.intercept_[0]
print B1, "B1", B0, "B0"
print np.exp(B1), "significant? Yes, I believe so."

# Despite its name, `prob` holds the value of LR.score on the test split.
prob = LR.score(X_test, y_test)
print prob, "model accuracy score"
#make predictions
preds = LR.predict(X_test)
Example #36
0
#to give column names
# NOTE(review): the empty list looks like a placeholder to be filled in with
# the real column names - confirm.
dataset.columns = []

#Dividing the dataset into independent and dependent variables
# NOTE(review): both X and Y currently select every row and every column;
# presumably the column slices are meant to be filled in - confirm.
X = dataset.iloc[:, :].values
Y = dataset.iloc[:, :].values

# Column names
list(dataset)

#Splitting the data into training set and validation set
from sklearn.cross_validation import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

#Correlation Matrix
# NOTE(review): train_X comes from `.values` above, so it is presumably a
# NumPy array without a .corr() method - verify before running.
corr_matrix = train_X.corr()
f, ax = plt.subplots(figsize=(16, 10))
ax = sns.heatmap(corr_matrix)
ax.set_title("correlation between all features")
figure = ax.get_figure()

#Do feature scaling if required.
#Convert into categorical variable if required

#Model
from sklearn.linear_model import LinearRegression
Example #37
0
# In[17]:

from sklearn.cross_validation import cross_val_score

# In[18]:

from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# In[22]:

# use train/test split with different random_state values
# (X and y are not defined in this excerpt; they must come from an earlier cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

# In[23]:

# simulate splitting a dataset of 25 observations into 5 folds
# (shuffle=False is passed, so no shuffling is requested)
from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False)

# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations',
Example #38
0
def load_data(dataset,
              nframes=13,
              features='MFCC',
              scaling='normalize',
              pca_whiten=0,
              cv_frac=0.2,
              dataset_name='timit',
              speakers=False,
              numpy_array_only=False):
    """ 
    params:
     - dataset: folder
     - nframes: number of frames to replicate/pad
     - features: 'MFCC' (13 + D + A = 39) || 'fbank' (40 coeffs filterbanks) 
                 || 'gamma' (50 coeffs gammatones)
     - scaling: 'none' || 'unit' (put all the data into [0-1])
                || 'normalize' ((X-mean(X))/std(X))
                || student ((X-mean(X))/std(X, deg_of_liberty=1))
     - pca_whiten: not if 0, MLE if < 0, number of components if > 0
     - cv_frac: cross validation fraction on the train set
     - dataset_name: prepended to the name of the serialized stuff
     - speakers: if true, Ys (labels) are speakers instead of phone's states
    """
    params = {
        'nframes_mfcc':
        nframes,
        'features':
        features,
        'scaling':
        scaling,
        'pca_whiten_mfcc_path':
        'pca_' + str(pca_whiten) + '.pickle' if pca_whiten else 0,
        'cv_frac':
        cv_frac,
        'theano_borrow?':
        BORROW,
        'use_caching?':
        USE_CACHING,
        'train_classifiers_1_frame?':
        TRAIN_CLASSIFIERS_1_FRAME,
        'train_classifiers?':
        TRAIN_CLASSIFIERS,
        'dataset_name':
        dataset_name,
        'speakers?':
        speakers
    }
    with open('prep_' + dataset_name + '_params.json', 'w') as f:
        f.write(json.dumps(params))
    suffix = scaling
    if speakers:
        suffix += "_spkr"

    def prep_and_serialize():
        [train_x, train_y, test_x, test_y, dev_x,
         dev_y] = prep_data(dataset,
                            nframes=nframes,
                            features=features,
                            scaling=scaling,
                            pca_whiten=pca_whiten,
                            dataset_name=dataset_name,
                            speakers=speakers,
                            dev=(cv_frac == 'fixed'))
        with open(
                prefix_path + 'train_x_' + dataset_name + '_' + features +
                str(nframes) + suffix + '.npy', 'wb') as f:
            np.save(f, train_x)
        with open(
                prefix_path + 'train_y_' + dataset_name + '_' + features +
                str(nframes) + suffix + '.npy', 'wb') as f:
            np.save(f, train_y)
        with open(
                prefix_path + 'test_x_' + dataset_name + '_' + features +
                str(nframes) + suffix + '.npy', 'wb') as f:
            np.save(f, test_x)
        with open(
                prefix_path + 'test_y_' + dataset_name + '_' + features +
                str(nframes) + suffix + '.npy', 'wb') as f:
            np.save(f, test_y)
        if dev_x != None:
            with open(
                    prefix_path + 'dev_x_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'wb') as f:
                np.save(f, dev_x)
        if dev_y != None:
            with open(
                    prefix_path + 'dev_y_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'wb') as f:
                np.save(f, dev_y)
        print ">>> Serialized all train/test tables"
        return [train_x, train_y, test_x, test_y, dev_x, dev_y]

    if USE_CACHING:
        try:  # try to load from serialized filed, beware
            with open(
                    prefix_path + 'train_x_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'rb') as f:
                train_x = np.load(f)
            with open(
                    prefix_path + 'train_y_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'rb') as f:
                train_y = np.load(f)
            with open(
                    prefix_path + 'test_x_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'rb') as f:
                test_x = np.load(f)
            with open(
                    prefix_path + 'test_y_' + dataset_name + '_' + features +
                    str(nframes) + suffix + '.npy', 'rb') as f:
                test_y = np.load(f)
            if cv_frac == 'fixed':
                with open(
                        prefix_path + 'dev_x_' + dataset_name + '_' +
                        features + str(nframes) + suffix + '.npy', 'rb') as f:
                    dev_x = np.load(f)
                with open(
                        prefix_path + 'dev_y_' + dataset_name + '_' +
                        features + str(nframes) + suffix + '.npy', 'rb') as f:
                    dev_y = np.load(f)
        except:  # do the whole preparation (normalization / padding)
            print "doing the preparation because no serialized data found"
            [train_x, train_y, test_x, test_y, dev_x,
             dev_y] = prep_and_serialize()
    else:
        [train_x, train_y, test_x, test_y, dev_x, dev_y] = prep_and_serialize()

    if cv_frac == 'fixed':
        X_train = train_x
        y_train = train_y
        X_validate = dev_x
        y_validate = dev_y
    else:
        from sklearn import cross_validation
        X_train, X_validate, y_train, y_validate = cross_validation.train_test_split(
            train_x, train_y, test_size=cv_frac, random_state=0)

    if numpy_array_only:
        train_set_x = X_train
        train_set_y = np.asarray(y_train, dtype='int32')
        val_set_x = X_validate
        val_set_y = np.asarray(y_validate, dtype='int32')
        test_set_x = test_x
        test_set_y = np.asarray(test_y, dtype='int32')
    else:
        train_set_x = theano.shared(X_train, borrow=BORROW)
        train_set_y = theano.shared(np.asarray(y_train,
                                               dtype=theano.config.floatX),
                                    borrow=BORROW)
        train_set_y = T.cast(train_set_y, 'int32')
        val_set_x = theano.shared(X_validate, borrow=BORROW)
        val_set_y = theano.shared(np.asarray(y_validate,
                                             dtype=theano.config.floatX),
                                  borrow=BORROW)
        val_set_y = T.cast(val_set_y, 'int32')
        test_set_x = theano.shared(test_x, borrow=BORROW)
        test_set_y = theano.shared(np.asarray(test_y,
                                              dtype=theano.config.floatX),
                                   borrow=BORROW)
        test_set_y = T.cast(test_set_y, 'int32')

    return [(train_set_x, train_set_y), (val_set_x, val_set_y),
            (test_set_x, test_set_y)]
import matplotlib.pyplot as plt
import pandas as pd


#Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
print(dataset)
# X: every column except the last one; Y: the column at position 1.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
#print(X)
#print(Y)


#Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
# 1.0/3 rather than 1/3: under Python 2 integer division 1/3 evaluates to 0.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 1.0/3, random_state = 0)
#print(X_train)
#print(X_test)
#print(Y_train)
#print(Y_test)


#Feature Scaling
#Most libraries will take care this step


#Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Example #40
0
>>> y = df['quality']
>>> regressor = LinearRegression()
>>> scores = cross_val_score(regressor, X, y, cv=5)
>>> print scores.mean(), scores
0.290041628842 [ 0.13200871 0.31858135 0.34955348 0.369145
0.2809196 ]

#Stochastic Gradient Descent
>>> import numpy as np
>>> from sklearn.datasets import load_boston
>>> from sklearn.linear_model import SGDRegressor
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.cross_validation import train_test_split
>>> data = load_boston()
>>> X_train, X_test, y_train, y_test = train_test_split(data.data,
...     data.target)

>>> X_scaler = StandardScaler()
>>> y_scaler = StandardScaler()
>>> X_train = X_scaler.fit_transform(X_train)
>>> y_train = y_scaler.fit_transform(y_train)
>>> X_test = X_scaler.transform(X_test)
>>> y_test = y_scaler.transform(y_test)

>>> regressor = SGDRegressor(loss='squared_loss')
>>> scores = cross_val_score(regressor, X_train, y_train, cv=5)
>>> print 'Cross validation r-squared scores:', scores
>>> print 'Average cross validation r-squared score:', np.mean(scores)
>>> regressor.fit_transform(X_train, y_train)
>>> print 'Test set r-squared score', regressor.score(X_test, y_test)
Example #41
0
	dt.fit(training_feature,training_target)
	print dt.score(training_feature, training_target)
	print dt.score(test_feature, test_target)
	print dt.best_estimator_
	
	
	parameters=[{'n_estimators':values}]
	dt=grid_search.GridSearchCV(ensemble.ExtraTreesClassifier(),parameters,cv=5,scoring="accuracy",n_jobs=6)
	dt.fit(training_feature,training_target)
	print dt.score(training_feature, training_target)
	print dt.score(test_feature, test_target)
	print dt.best_estimator_
	
	
	'''
	
	

# Repeat the split-and-evaluate cycle ten times, each with a fresh split.
num=0
while num<10:
	# Hold out 15% of the rows of x (as an absolute row count) for testing.
	train,test=train_test_split(x,test_size=int(x.shape[0]*0.15))

	# Column 0 is used as the target, the remaining columns as features.
	feature= train[0:,1:]
	target= train[0:,0]

	feature_test=test[0:,1:]
	target_test= test[0:,0]
	random_forest(feature, target, feature_test,target_test)
	num=num+1

Example #42
0
        count=count+1
       
# Absolute difference of every unordered pair of negative samples.
count=0
for i in range(0,len(negdataset)):
    for j in range(i+1,len(negdataset)):
        negTol[count]=np.abs(negdataset[i]-negdataset[j])
        count=count+1
# Shuffle, then build the dataset
np.random.shuffle(posTol)
np.random.shuffle(negTol)
pos=np.array(posTol[0:poSize])
# Take the negatives from negTol; the original sliced posTol here, so the
# "negative" half of the dataset was made of positive pairs.
neg=np.array(negTol[0:neSize])
dataset=np.concatenate((pos,neg),axis=0)

#split the dataset and label:
X_train, X_test, y_train, y_test = train_test_split(dataset, label, random_state=0)
#np.transpose(label)
print(mt.sqrt(dataset.shape[1]))
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred_class = lr.predict(X_test)
# calculate accuracy
print ("acc:",metrics.accuracy_score(y_test, y_pred_class))
# Compute the null accuracy (always predicting the majority class)
print("null acc:",max(y_test.mean(), 1-y_test.mean()))
# Confusion matrix
print ("混淆矩阵:",metrics.confusion_matrix(y_test, y_pred_class))
#scores = cross_val_score(lr, dataset, label,cv=2,scoring='roc_auc')
#result=scores.mean()
#print(result)
elapsed = (time.clock() - start)
def trainencoder(
      sources = ("image_vects", "word_vects")
    , sources_k = ("image_vects_k", "word_vects_k")
    , batch_size=128
    , embedding_dim=300
    , n_captions=5
    , n_sbu=None
    , separate_emb=False
    , test_size=1000 # per dataset
    , mode='dev'
    ):
    if mode=="coco120k+flickr38k":
        XYsplit_cum = ([], [], [], [])
        xyloaders = [
              "cocoXYFilenames(dataType='train2014')"
            , "cocoXYFilenames(dataType='val2014')"
            , "flickrXYFilenames(dataType='8k')"
            , "flickrXYFilenames(dataType='30k')"
            ]
        ntrains = [80000, 40000, 8000, 30000]

        for xyloader, ntrain in zip(xyloaders, ntrains):
            X, Y, _ = eval(xyloader)
            XYsplit = train_test_split(X, Y, train_size=ntrain)
            for i in range(len(XYsplit)):
                XYsplit_cum[i].extend(XYsplit[i])

        trX, teX, trY, teY = XYsplit_cum
    else:
        trX, teX, trY, teY = coco(mode=mode, n_captions=n_captions, test_size=test_size)
        if n_sbu:
            sbutrX, sbuteX, sbutrY, sbuteY = sbu(mode=mode, test_size=test_size)
            pairs = (
                  (trX, sbutrX)
                , (teX, sbuteX)
                , (trY, sbutrY)
                , (teY, sbuteY)
                )

            for coco_data, sbu_data in pairs:
                if isinstance(coco_data, list):
                    coco_data.extend(sbu_data)

    print("n_train: %d" % len(trX))
    print("n_test: %d" % len(teX))

    # # # # # # # # # # #
    # Modeling Building #
    # # # # # # # # # # #

    s = Encoder(
          image_feature_dim=4096
        , embedding_dim=embedding_dim
        , biases_init=Constant(0.)
        , weights_init=Uniform(width=0.08)
        )
    s.initialize()

    image_vects = tensor.matrix(sources[0]) # named to match the source name
    word_vects = tensor.tensor3(sources[1]) # named to match the source name
    image_vects_k = tensor.matrix(sources_k[0]) # named to match the contrastive source name
    word_vects_k = tensor.tensor3(sources_k[1]) # named to match the contrastive source name

    # image_vects.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects.tag.test_value = np.zeros((2, 15, 50), dtype='float32')
    # image_vects_k.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects_k.tag.test_value = np.zeros((2, 15, 50), dtype='float32')

    # learned image embedding, learned sentence embedding
    lim, ls = s.apply(image_vects, word_vects)

    # learned constrastive im embedding, learned contrastive s embedding
    lcim, lcs = s.apply(image_vects_k, word_vects_k)

    # identical cost code thanks to Ryan Kiros
    # https://github.com/youralien/skip-thoughts/blob/master/eval_rank.py
    lim = l2norm(lim)
    lcim = l2norm(lcim)
    ls = l2norm(ls)
    lcs = l2norm(lcs)

    margin = 0.2 # alpha term should not be more than 1

    cost_im = margin - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1)
    cost_im = cost_im * (cost_im > 0.) # this is like the max(0, pairwise-ranking-loss)
    cost_im = cost_im.sum(0)

    cost_s = margin - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1)
    cost_s = cost_s * (cost_s > 0.) # this is like max(0, pairwise-ranking-loss)
    cost_s = cost_s.sum(0)

    cost = cost_im + cost_s
    cost.name = "pairwise_ranking_loss"

    # function(s) to produce embedding
    if separate_emb:
        img_encoder = theano.function([image_vects], lim)
        txt_encoder = theano.function([word_vects], ls)
    f_emb = theano.function([image_vects, word_vects], [lim, ls])

    if n_sbu:
        sbuname = "sbu%d+" % n_sbu
    else:
        sbuname = ''
    name = "%sproject1.%s.jointembedder" % (sbuname, mode)
    savename = MODEL_FILES_DIR + name

    def save_function(self):
        if separate_emb:
            ModelIO.save(
                  img_encoder
                , savename + "_Img")
            ModelIO.save(
                  txt_encoder
                , savename + "_Txt")
        ModelIO.save(f_emb, savename)
        print "Similarity Embedding function(s) saved while training"

    def rank_function(stream):
        images, captions, _0, _1 = stream.get_epoch_iterator().next()
        image_embs, caption_embs = f_emb(images, captions)
        ModelEval.ImageSentenceRanking(image_embs, caption_embs)

    def rank_coco(self=None):
        # Get 1000 images / captions to test rank
        stream = DataETL.getFinalStream(teX, teY, sources=sources,
                            sources_k=sources_k, batch_size=test_size,
                            shuffle=True)
        print "COCO test"
        rank_function(stream)

    def rank_sbu(self=None):
        stream = DataETL.getFinalStream(sbuteX, sbuteY, sources=sources,
                            sources_k=sources_k, batch_size=test_size,
                            shuffle=True)
        print "SBU test"
        rank_function(stream)

    def rank_em(self=None):
        rank_coco()
        if n_sbu:
            rank_sbu()

    cg = ComputationGraph(cost)

    # # # # # # # # # # #
    # Modeling Training #
    # # # # # # # # # # #

    algorithm = GradientDescent(
          cost=cost
        , parameters=cg.parameters
        , step_rule=Adam(learning_rate=0.0002)
        )
    main_loop = MainLoop(
          model=Model(cost)
        , data_stream=DataETL.getFinalStream(trX, trY, sources=sources,
              sources_k=sources_k, batch_size=batch_size)
        , algorithm=algorithm
        , extensions=[
              DataStreamMonitoring(
                  [cost]
                , DataETL.getFinalStream(trX, trY, sources=sources,
                      sources_k=sources_k, batch_size=batch_size, shuffle=True)
                , prefix='train')
            , DataStreamMonitoring(
                  [cost]
                , DataETL.getFinalStream(teX, teY, sources=sources,
                      sources_k=sources_k, batch_size=batch_size, shuffle=True)
                , prefix='test')
            , UserFunc(save_function, after_epoch=True)
            , UserFunc(rank_em, after_epoch=True)
            , Printing()
            , LogToFile('logs/%s.csv' % name)
            ]
        )
    main_loop.run()
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Load the digits data and turn the integer targets into a binarized label
# matrix with LabelBinarizer.
digits = load_digits()
X = digits.data
y = digits.target
y = LabelBinarizer().fit_transform(y)
# 30% of the samples are held out for testing (no fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Network configuration constants.
# NOTE(review): presumably 64 = features per digit image and 10 = number of
# classes - confirm against the data.
input_size = 64
output_size = 10
norm = False
hidden_units1 = 100
hidden_units2 = 50


def fc_layer_without_bn(x_input, num_units, activation, is_training):
    """Apply a bias-free dense layer of `num_units` units, then `activation`.

    `is_training` is accepted but not used by this layer.
    """
    pre_activation = tf.layers.dense(x_input, num_units, use_bias=False)
    return activation(pre_activation)


def compute_accuracy(v_xs, v_ys):
    # Evaluate the network on v_xs with on_train set to False.
    # NOTE(review): the excerpt ends after the prediction step; v_ys is not
    # used in the visible part.
    global prediction  # declare prediction as a global variable first

    y_pre = sess.run(prediction, feed_dict={
        xs: v_xs,
        on_train: False
    })  # predicted values (probabilities); 10 classes, so each sample has 10 probability columns
import pickle
import numpy
numpy.random.seed(42)

### the words (features) and authors (labels), already largely processed
### these files should have been created from the previous (Lesson 10) mini-project.
words_file = "/Users/dadda/Dropbox (MIT)/Online Courses/Intro to ML/ud120-projects-master/text_learning/your_word_data.pkl"
authors_file = "/Users/dadda/Dropbox (MIT)/Online Courses/Intro to ML/ud120-projects-master/text_learning/your_email_authors.pkl"
### the pickle files are opened in with-blocks so they are closed again right
### after loading (the open mode is kept as in the original script)
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)

### test_size is the percentage of events assigned to the test set (remainder go into training)
### feature matrices changed to dense representations for compatibility with classifier
### functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

### the vectorizer is fitted on the training texts only and then applied to
### the test texts
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
# Bare expression left over from a notebook cell (displays df_y when run interactively).
df_y

# In[14]:

# Convert the feature and label containers to NumPy arrays.
df_x = np.array(df_x)
df_y = np.array(df_y)

# In[15]:

df_x.shape

# In[16]:

# Train/test split: 20% of the samples are held out, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(df_x,
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=4)
# done with preprocessing

# In[17]:

# CNN model
model = Sequential()
# First layer: 32 convolution filters of size 3x3 with ReLU activation,
# taking 28x28 single-channel inputs (channels-last layout).
model.add(
    Convolution2D(32,
                  3,
                  data_format='channels_last',
                  activation='relu',
                  input_shape=(28, 28, 1)))
# reduce the number of parameters by keeping only the important ones
Example #47
0
from utils.dataloaders import load_wassa, sentence_dataset
from utils.early_stopping import Early_stopping
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocessor
from utils.training import class_weigths, epoch_summary, save_checkpoint

# load dataset
config = ConfLangModel
dataset = 'emotion2M'
name = 'emotion_with_2M'

data = sentence_dataset(os.path.join(DATA_DIR, dataset, "emotion_final.txt"))
# Placeholder labels: train_test_split is called with two arrays, and the
# two label halves it returns are discarded below.
y = np.zeros(len(data))

# 80/20 train/validation split with a fixed seed.
train_data, val_data, _, _ = train_test_split(data, y,
                                              test_size=0.2,
                                              random_state=13)
# train_data = train_data[:1000]
# val_data = val_data[:100]
#####################################################################
# Define Dataloaders
#####################################################################

# Attention! The emotion dataset is already PREPROCESSED with ekphrasis!

# preprocessor = twitter_preprocessor()
preprocessor = None
if preprocessor is None:
    train_name = "train_simple_split_{}".format(dataset)
    val_name = "valid_simple_split_{}".format(dataset)
else:

### load up some practice data with outliers in it
### pickles are binary data: "rb" is required on Python 3 and harmless on
### Python 2; the context managers close the handles once loading is done
with open("practice_outliers_ages.pkl", "rb") as ages_file:
    ages = pickle.load(ages_file)
with open("practice_outliers_net_worths.pkl", "rb") as net_worths_file:
    net_worths = pickle.load(net_worths_file)



### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
from sklearn.cross_validation import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train,net_worths_train)

### single-argument print calls: they emit the same text as the old
### `print a, b` statements under Python 2 and are also valid Python 3
print('%s %s' % ('The Slope Of The Regression Line Is: ', reg.coef_))

print('%s %s' % ('The Regression Score On Test Data: ', reg.score(ages_test, net_worths_test)))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
Example #49
0
# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=300,
                             n_jobs=-1,
                             class_weight='balanced_subsample')

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# A single 70/30 hold-out split with a fixed seed; the classifier is fitted
# on the training part and scored on the held-out part.
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

from sklearn.metrics import (recall_score, precision_score)
# Parenthesised single-argument print: identical output under Python 2 and
# valid syntax under Python 3 (the bare print statement is Python 2 only).
print(precision_score(labels_test, pred, average='binary'))
print(recall_score(labels_test, pred, average='binary'))

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #50
0
        X0[s2, :] = X[i, :]
        s2 = s2 + 1

# Keep only the first s2 rows of the class-0 arrays and the first s1 rows of
# the class-1 arrays (presumably the rows actually filled by the loop above
# -- confirm against the full script).
Y0 = Y0[0:s2]
X0 = X0[0:s2]
Y1 = Y1[0:s1]
X1 = X1[0:s1]

# Ensure the same proportion of positive and negative categories
total = len(Y0)
partial = len(Y1)
# float() forces true division: under Python 2, int / int would floor the
# ratio to 0 whenever len(Y1) < len(Y0).
prop = float(partial) / total

## Shuffling the data: draw a `prop` fraction of the class-0 rows to keep
X_use, X_discart, Y_use, Y_discart = train_test_split(X0, Y0,
                                                      test_size=0.3,
                                                      train_size=prop,
                                                      random_state=90)

## Reconstruct same proportion data-set
X_data = np.concatenate([X1, X_use])
Y_data = np.concatenate([Y1, Y_use])

## Normalization: min-max scale each of the first 7 columns to [0, 1]
for i in range(7):
    x_max = max(X_data[:, i])
    x_min = min(X_data[:, i])
    span = x_max - x_min
    if span == 0:
        # Constant column: there is nothing to scale, so map it to 0
        # instead of dividing by zero.
        X_data[:, i] = 0
        continue
    # 1.0 keeps this a true division even for integer inputs on Python 2.
    k = 1.0 / span
    X_data[:, i] = (X_data[:, i] - x_min) * k

# Data prep for training and testing
prop = 12000.0 / len(Y_data)
Example #51
0
def PCAPlot():
    """Plot a 2-D PCA projection and LDA score histograms for anime vs. jpop.

    Reads the module-level ``anime`` and ``jpop`` item collections. Each item
    maps the tag families ``"i2vtags"``, ``"mstags"`` and ``"gotags"`` to a
    sequence of ``(label, value)`` pairs; every distinct
    ``"<family>_<label>"`` becomes one feature column.

    Side effects: re-wraps ``sys.stdin``/``sys.stdout``/``sys.stderr`` as
    UTF-8 text streams, writes ``pca.png`` and ``lda.png``, shows both
    figures and prints the LDA train/test accuracy. Returns ``None``.
    """
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
    tags = ["i2vtags", "mstags", "gotags"]
    #    tags = ["gotags"]

    def tag_key(family, entry):
        # Feature name: "<family>_<label>", with spaces in the label
        # replaced by underscores.
        return family + "_" + entry[0].replace(" ", "_")

    # Collect every distinct feature name seen in either collection.
    tagset = set()
    for collection in (anime, jpop):
        for item in collection:
            for family in tags:
                for st in item[family]:
                    tagset.add(tag_key(family, st))

    # Index 0 is a placeholder for the class column, hence the "- 1" offsets
    # wherever a name is mapped to a feature column below.
    idtag = ["anime/jpop"] + sorted(tagset)
    tagid = {name: index for index, name in enumerate(idtag)}

    n_jpop = len(jpop)
    feature = np.zeros((n_jpop * 2, len(idtag) - 1))
    # One row per item: the first len(jpop) anime items, then all jpop items.
    row = 0
    for collection in (anime[:n_jpop], jpop):
        for item in collection:
            for family in tags:
                for st in item[family]:
                    feature[row][tagid[tag_key(family, st)] - 1] = st[1]
            row += 1

    from sklearn.decomposition import PCA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    pca = PCA(n_components=2)
    xtr = pca.fit_transform(feature)
    plt.scatter(xtr[:n_jpop, 0],
                xtr[:n_jpop, 1],
                color="red",
                label="anime")
    plt.scatter(xtr[n_jpop:, 0],
                xtr[n_jpop:, 1],
                color="blue",
                label="jpop")
    plt.legend()
    plt.savefig("pca.png")
    plt.show()

    # Split each class separately with the same test fraction, then stack
    # the class-0 (anime) rows ahead of the class-1 (jpop) rows.
    xtr1, xte1, ytr1, yte1 = train_test_split(feature[:n_jpop],
                                              [0] * n_jpop,
                                              test_size=0.2)
    xtr2, xte2, ytr2, yte2 = train_test_split(feature[n_jpop:],
                                              [1] * n_jpop,
                                              test_size=0.2)
    xtr = list(xtr1) + list(xtr2)
    xte = list(xte1) + list(xte2)
    ytr = list(ytr1) + list(ytr2)
    yte = list(yte1) + list(yte2)
    lda = LinearDiscriminantAnalysis()
    ytrp = lda.fit_transform(xtr, ytr)
    ytep = lda.transform(xte)
    print(lda.score(xtr, ytr), lda.score(xte, yte))

    def plot_class_hists(scores, xlabel, **extra):
        # The first half of `scores` is anime and the second half jpop (see
        # the stacking order above). Floor division is required: a float
        # slice index raises TypeError on Python 3.
        # NOTE(review): `normed` was removed in Matplotlib 3.1; switch to
        # `density=True` if this must run on a current Matplotlib.
        half = len(scores) // 2
        plt.hist(scores[:half],
                 normed=True,
                 bins=50,
                 alpha=0.3,
                 label="anime",
                 color="red",
                 **extra)
        plt.hist(scores[half:],
                 normed=True,
                 bins=50,
                 alpha=0.3,
                 label="jpop",
                 color="blue",
                 **extra)
        plt.xlabel(xlabel)
        plt.legend()

    plt.subplot(2, 1, 1)
    plot_class_hists(ytrp, "train")
    plt.subplot(2, 1, 2)
    plot_class_hists(ytep, "test", range=(-20, 20))
    plt.savefig("lda.png")
    plt.show()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 20:22:17 2017

@author: rishi
"""
# Just copy and paste the content and change the variable indexes to match your data
import numpy as np; 
import matplotlib.pyplot as plt;
import pandas as pd;

# Load the CSV: every column except the last is a feature; the target is
# taken from column index 3.
dataset = pd.read_csv('Data.csv')
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, 3].values

# Keep 20% of the rows for testing, with a fixed seed.
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=0)
Example #53
0
def makeCorpus():
    """Rank "gotags" terms by random-forest importance for anime vs. jpop.

    Builds one space-joined document per item (from the module-level
    ``anime`` and ``jpop`` collections) that has at least one ``"gotags"``
    entry, vectorises the documents with TF-IDF, grid-searches a random
    forest over ``n_estimators`` and writes ``"<term> <importance>"`` lines
    to ``importance.txt`` in descending order of importance.

    Side effects: re-wraps ``sys.stdin``/``sys.stdout``/``sys.stderr`` as
    UTF-8 text streams, writes ``importance.txt`` and prints the train/test
    accuracy and the best grid parameters. Returns ``None``.
    """
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    tag = "gotags"

    def build_corpus(items):
        # One document per item with at least one tag; each tag label
        # becomes a single token (inner spaces removed).
        return [" ".join(st[0].replace(" ", "") for st in item[tag])
                for item in items if len(item[tag]) > 0]

    corp_anime = build_corpus(anime)
    corp_jpop = build_corpus(jpop)

    trf = TfidfVectorizer(max_df=0.9, min_df=5)
    trf.fit(corp_anime + corp_jpop)
    xa = trf.transform(corp_anime).toarray()
    xj = trf.transform(corp_jpop).toarray()
    # Split jpop 80/20, then draw equally sized anime train/test sets so
    # both classes contribute the same number of rows.
    xjtr, xjte = train_test_split(xj, test_size=0.2)
    xatr, xate = train_test_split(xa,
                                  train_size=xjtr.shape[0],
                                  test_size=xjte.shape[0])
    voc = trf.vocabulary_

    xtr = np.vstack((xatr, xjtr))
    xte = np.vstack((xate, xjte))

    # Label 0 = anime, label 1 = jpop, matching the stacking order above.
    ytr = [0] * xatr.shape[0] + [1] * xjtr.shape[0]
    yte = [0] * xate.shape[0] + [1] * xjte.shape[0]

    from sklearn.ensemble import RandomForestClassifier
    try:
        from sklearn.model_selection import GridSearchCV
    except ImportError:  # older scikit-learn that only ships grid_search
        from sklearn.grid_search import GridSearchCV

    # An XGBoost grid was previously sketched here as a disabled string
    # literal; it was dead code, so the unconditional xgboost import that
    # only served it has been dropped.
    param = {"n_estimators": list(range(25, 35, 1)), "max_depth": [2]}
    rf = RandomForestClassifier()

    grd = GridSearchCV(rf, param)
    grd.fit(xtr, ytr)
    clf = grd.best_estimator_

    # Invert term -> column once instead of scanning the whole vocabulary
    # for every feature.
    term_of = {column: term for term, column in voc.items()}
    ranked = sorted(
        ((importance, column)
         for column, importance in enumerate(clf.feature_importances_)),
        reverse=True)
    # The context manager closes the file even if a write fails.
    with open("importance.txt", "w") as fout:
        for importance, column in ranked:
            fout.write("{0} {1}\n".format(term_of[column], importance))

    train_sc = clf.score(xtr, ytr)
    test_sc = clf.score(xte, yte)
    print(train_sc, test_sc)
    print(grd.best_params_)
Example #54
0
@author: Hardikk Madaan
"""

#LEARNING LOGISTIC REGRESSION

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Columns 2 and 3 are the predictors; column 4 holds the label.
dataset = pd.read_csv("Social_Network_Ads.csv")
features, labels = (dataset.iloc[:, [2, 3]].values,
                    dataset.iloc[:, 4].values)

# Hold out a quarter of the rows for evaluation (fixed seed).
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=0)

# Standardise both splits with statistics fitted on the training split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

# Fit the logistic regression on the scaled training split.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0).fit(features_train,
                                                    labels_train)

# Predict the labels of the held-out rows.
labels_pred = classifier.predict(features_test)
def form_post():
    """Handle the team-selection form and render the predicted match result.

    Stores the submitted ``sel1``/``sel2`` values in the module-level
    ``team1``/``team2``. When ``team1`` is non-empty the function loads
    ``final_final_dataset.csv``, trains logistic-regression, SVM and
    grid-searched XGBoost classifiers on it, prints their scores, and uses
    the XGBoost model to predict the result for the row of
    ``team_dataframe.csv`` whose ``HomeTeam`` matches ``team1``. The outcome
    is stored in the module-level ``teamwin`` and ``hnh``.

    Returns the rendered ``index.html`` template.

    NOTE(review): when ``team1`` is empty, ``teamwin`` and ``hnh`` are not
    assigned here, so they must already exist at module level or the final
    ``render_template`` call raises ``NameError`` -- confirm.
    """
    global team1, team2, teamindex, teamwin, hnh
    team1 = request.form['sel1']
    team2 = request.form['sel2']
    if team1 != '':
        data = pd.read_csv('final_final_dataset.csv')
        data = data[data.MW > 3]
        teamname = team1

        # `axis` is passed by keyword: the positional form was removed in
        # pandas 2.0. Reassigning instead of inplace=True also avoids
        # mutating a filtered copy.
        data = data.drop(['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
           'HM4','HM5','AM4','AM5','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3'], axis=1)

        # Separate into feature set and target variable
        X_all = data.drop(['FTR'], axis=1)
        y_all = data['FTR']

        # Standardise the numeric columns (all five are scaled in one call).
        cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]
        for col in cols:
            X_all[col] = scale(X_all[col])

        # Cast the recent-match columns to str so they are one-hot encoded
        # by preprocess_features below.
        X_all.HM1 = X_all.HM1.astype('str')
        X_all.HM2 = X_all.HM2.astype('str')
        X_all.HM3 = X_all.HM3.astype('str')
        X_all.AM1 = X_all.AM1.astype('str')
        X_all.AM2 = X_all.AM2.astype('str')
        X_all.AM3 = X_all.AM3.astype('str')

        def preprocess_features(X):
            ''' Preprocesses the football data and converts categorical variables into dummy variables. '''

            # Initialize new output DataFrame
            output = pd.DataFrame(index = X.index)

            # Investigate each feature column for the data
            # (DataFrame.iteritems was removed in pandas 2.0; items() is
            # the equivalent iterator.)
            for col, col_data in X.items():

                # If data type is categorical, convert to dummy variables
                if col_data.dtype == object:
                    col_data = pd.get_dummies(col_data, prefix = col)

                # Collect the revised columns
                output = output.join(col_data)

            return output

        X_all = preprocess_features(X_all)
        print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

        # test_size is an absolute count here: 50 rows are held out.
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 50, random_state = 2, stratify = y_all)


        def predict_labels(clf, features, target):
            ''' Predicts with a fitted classifier; returns (micro-averaged F1, accuracy). '''

            # Start the clock, make predictions, then stop the clock
            start = time()
            print("-------------")
            print(type(features))
            print("---------------")
            y_pred = clf.predict(features)
            end = time()
            # Print and return results
            print("Made predictions in {:.4f} seconds.".format(end - start))

            return f1_score(target, y_pred, labels=['A','D','H'],average='micro'), sum(target == y_pred) / float(len(y_pred))

        # Scorer and single-point parameter grid for the XGBoost grid search
        f1_scorer = make_scorer(f1_score,labels=['A','D','H'],average='micro')
        parameters = { 'learning_rate' : [0.1],
                    'n_estimators' : [40],
                    'max_depth': [3],
                    'min_child_weight': [3],
                    'gamma':[0.4],
                    'subsample' : [0.8],
                    'colsample_bytree' : [0.8],
                    'scale_pos_weight' : [1],
                    'reg_alpha':[1e-5]
                    }
        #clf.fit(X_train, y_train)
        logistic = LogisticRegression(random_state=42)
        svm = SVC(random_state=912, kernel='rbf')

        # Both baselines are scored on the held-out rows, so the messages
        # say "test set" (they previously said "training set").
        logistic.fit(X_train,y_train)
        f1, acc = predict_labels(logistic,X_test,y_test)
        print("Logistic Regression --> final F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

        svm.fit(X_train,y_train)
        f1, acc = predict_labels(svm,X_test,y_test)
        print("SVM --> final F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

        clf = xgb.XGBClassifier(seed=2)
        # Perform grid search on the classifier using the f1_scorer as the scoring method
        grid_obj = GridSearchCV(clf, scoring=f1_scorer, param_grid=parameters, cv=5)

        # NOTE(review): the grid search is fitted on the full data set
        # (X_all), so the "test set" scores printed below for this model
        # are not held-out estimates.
        grid_obj = grid_obj.fit(X_all,y_all)

        # Get the estimator
        clf = grid_obj.best_estimator_
        #print(clf)

        # Report the final F1 score for training and testing after parameter tuning
        f1, acc = predict_labels(clf, X_train, y_train)
        print("final F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

        f1, acc = predict_labels(clf, X_test, y_test)
        print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))
        data2 = pd.read_csv('team_dataframe.csv')
        data2 = data2.iloc[30:]
        # Default row label used when no HomeTeam matches; otherwise the
        # last matching row wins.
        teamindex = 122
        for index, row in data2.iterrows():
            if teamname == row['HomeTeam']:
                teamindex = index
        #print(type(X_all.loc[x].to_frame().T))
        #print(X_all.loc[x].to_frame().T)
        # Predict on the single feature row labelled teamindex.
        winnerlist = clf.predict(X_all.loc[teamindex].to_frame().T)
        print(winnerlist)
        teamwin = winnerlist[0]
        if teamwin == 'A':
            teamwin = team2
            hnh = "AwayTeam"
        elif teamwin == 'H':
            teamwin = team1
            hnh = "HomeTeam"
        else:
            teamwin = "DRAW!"
            hnh = "The game will be a DRAW"
        print(teamwin)
    else:
        print(team1+"    "+team2)

    return render_template('index.html', text=teamwin,bleh=team2,blehh=hnh)
Example #56
0
def linearSep():
    """Print the "i2vtags" terms that best separate anime from jpop.

    Builds one space-joined document per item (from the module-level
    ``anime`` and ``jpop`` collections) that has at least one ``"i2vtags"``
    entry, vectorises the documents with TF-IDF, selects features with
    recursive feature elimination around a linear SVM, refits the SVM on the
    selected columns and prints its train/test accuracy followed by the five
    lowest-weighted and five highest-weighted ``[coefficient, term]`` pairs.

    Side effects: re-wraps ``sys.stdin``/``sys.stdout``/``sys.stderr`` as
    UTF-8 text streams and prints to stdout. Returns ``None``.
    """
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    print(anime[0])
    tag = "i2vtags"

    def build_corpus(items):
        # One document per item with at least one tag; each tag label
        # becomes a single token (inner spaces removed).
        return [" ".join(st[0].replace(" ", "") for st in item[tag])
                for item in items if len(item[tag]) > 0]

    corp_anime = build_corpus(anime)
    corp_jpop = build_corpus(jpop)

    trf = TfidfVectorizer(max_df=1.0, min_df=10)
    trf.fit(corp_anime + corp_jpop)
    xa = trf.transform(corp_anime).toarray()
    xj = trf.transform(corp_jpop).toarray()
    # Split jpop 80/20, then draw equally sized anime train/test sets so
    # both classes contribute the same number of rows.
    xjtr, xjte = train_test_split(xj, test_size=0.2)
    xatr, xate = train_test_split(xa,
                                  train_size=xjtr.shape[0],
                                  test_size=xjte.shape[0])
    voc = trf.vocabulary_

    xtr = np.vstack((xatr, xjtr))
    xte = np.vstack((xate, xjte))
    # Label 0 = anime, label 1 = jpop, matching the stacking order above.
    ytr = [0] * xatr.shape[0] + [1] * xjtr.shape[0]
    yte = [0] * xate.shape[0] + [1] * xjte.shape[0]

    from sklearn import svm
    from sklearn.feature_selection import RFE
    rf = svm.LinearSVC()
    rfe = RFE(estimator=rf, n_features_to_select=10000, step=10)
    rfe.fit(xtr, ytr)
    xtrt = rfe.transform(xtr)
    xtet = rfe.transform(xte)
    # `rf` is refitted here on the selected columns; its coefficients are
    # the ones reported below.
    rf.fit(xtrt, ytr)
    print(rf.score(xtrt, ytr))
    print(rf.score(xtet, yte))

    # Original column indices of the selected features. The previous trick
    # of pushing a 1-D index list through rfe.transform is rejected by
    # current scikit-learn, which requires 2-D input.
    supIndex = rfe.get_support(indices=True)

    # Invert term -> column once instead of scanning the vocabulary for
    # every selected feature.
    term_of = {column: term for term, column in voc.items()}

    feats = [[weight, term_of[column]]
             for weight, column in zip(rf.coef_[0], supIndex)]
    feats.sort()
    print("\n".join(list(map(str, feats[0:5]))))
    feats.reverse()
    print("\n".join(list(map(str, feats[0:5]))))
#Dataset provided by SuperDataScience.com

#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the ads data: columns 2 and 3 are the inputs, column 4 the target.
dataset = pd.read_csv('Social_Network_Ads.csv')
x, y = dataset.iloc[:, [2, 3]].values, dataset.iloc[:, 4].values

# Reserve 15% of the rows as the test set (fixed seed).
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=0)

# Feature scaling (not necessary for decision trees, but helps when visualizing the data).
# The scaler is fitted on the training split and reused for the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Fit a 10-tree random forest with the entropy criterion on the training set.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0).fit(x_train, y_train)
Example #58
0
# Feature columns used as model inputs ('Feature 5' is not in the list).
train_column = [
    'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6',
    'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10'
]

X = d[train_column]
Y = d[target_column]
y_true = d[target_column]
# Sample sizes to evaluate: each iteration below works on the first i rows.
a = np.array([100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000])

for i in a:
    #print(i)
    XX = X.iloc[0:i]  # first i rows of the features
    YY = Y.iloc[0:i]  # matching first i rows of the target
    # 70/30 train/test split of the current subset, with a fixed seed.
    X_train, X_test, Y_train, Y_test = train_test_split(XX,
                                                        YY,
                                                        test_size=0.30,
                                                        random_state=20)
    #print(XX)
    #print(YY)
    # Fit a fresh logistic regression on this subset's training part.
    clf = linear_model.LogisticRegression()
    clf.fit(X_train, Y_train)
    #print(clf.predict(XX))
    y_pred = clf.predict(X_test)
    #print(clf.score(XX,YY))

    # Accuracy on the held-out part of the subset.
    AA = accuracy_score(Y_test, y_pred)
    print('Accuracy Score:')
    print(AA)

    # Micro-averaged F1 on the same held-out part.
    BB = f1_score(Y_test, y_pred, average='micro')
    print('F1 Score')
Example #59
0
one_hot_encoder = enc.fit(integer_classes)

# Step 1: map the class column (column 0) to integer codes with
# label_encoder, shaped as a single column.
num_of_rows = titanic_X.shape[0]
t = label_encoder.transform(titanic_X[:, 0]).reshape(num_of_rows, 1)

# Step 2: one-hot encode the codes into a sparse matrix with one column
# per class.
new_features = one_hot_encoder.transform(t)

# Step 3: append the one-hot columns to titanic_X and drop the original
# class column (column 0) in the same step.
titanic_X = np.hstack((titanic_X[:, 1:], new_features.toarray()))

# Column names after the replacement.
feature_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']

# Everything is numeric from here on.
titanic_X, titanic_y = titanic_X.astype(float), titanic_y.astype(float)

## Check
print(feature_names)
print(titanic_X[0], titanic_y[0])

## Holdout: 25% of the rows form the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    titanic_X, titanic_y, test_size=0.25, random_state=33)
# Apply the already-fitted scaler `scl` to the test-set feature columns.
test[features] = scl.transform(test[features])

# Booster settings passed to xgb.train below.
params = {"objective": "reg:linear",
          "eta": 0.3,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 300

print("Train a XGBoost model")
# NOTE(review): val_size is only referenced by the commented-out split below.
val_size = 100000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
# Random 99/1 split of `train`; X_test serves as the validation set.
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
# The model is trained on log(Sales + 1).
dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

print("Validating")
# Despite the name, these are predictions for the held-out X_test rows.
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
# Clamp negative log-space predictions to 0 before inverting the transform.
indices = train_probs < 0
train_probs[indices] = 0
# exp(p) - 1 undoes the log(Sales + 1) transform applied to the targets.
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

print("Make predictions on the test set")