def test_load_svmlight_files():
    X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, dtype=np.float32)
    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_equal(y_train, y_test)
    assert_equal(X_train.dtype, np.float32)
    assert_equal(X_test.dtype, np.float32)

    X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, dtype=np.float64)
    assert_equal(X1.dtype, X2.dtype)
    assert_equal(X2.dtype, X3.dtype)
    assert_equal(X3.dtype, np.float64)
Example #2
def classify_test(feature_list=[], classifiers=[], root_path='./'):
    #load data set
    datasets = []
    for name in feature_list:
        logging.log(logging.DEBUG, 'loading data: %s ...' % name)
        filenames = tuple(['./feature/%s_%s' % (name, tag) for tag in ['train.txt', 'test.txt']])
        X_train, y_train, X_test, y_test = load_svmlight_files(filenames)
        datasets.append((name, X_train, y_train, X_test, y_test))

    #make directory to store results
    result_path = path.join(root_path, 'results')
    if path.exists(result_path):
        assert path.isdir(result_path), 'results must be a directory!'
    else:
        system('mkdir ' + result_path)

    for clf in classifiers:
        for feature in datasets:
            clf_name = clf.__class__.__name__
            feature_name, X_train, y_train, X_test, y_test = feature
            combine_name = feature_name+'_'+clf_name
            info = {}

            logging.log(logging.DEBUG, 'classification test: %s ...' % combine_name)

            logging.log(logging.DEBUG, 'training...')
            t0 = time()
            clf.fit(X_train, y_train)
            t1 = time()
            info['training_time'] = t1-t0

            logging.log(logging.DEBUG, 'testing on training...')
            pred_y = clf.predict(X_train)
            training_acc = accuracy_score(y_train, pred_y)
            logging.log(logging.DEBUG, 'error rate on training set: %f' % (1.0 - training_acc))
            info['training_error'] = 1.0 - training_acc
            fout = open(path.join(result_path, combine_name+'_train.txt'), 'w')
            for y in pred_y:
                print >>fout, y
            fout.close()

            logging.log(logging.DEBUG, 'testing...')
            t0 = time()
            pred_y = clf.predict(X_test)
            t1 = time()
            info['test_time'] = t1-t0
            test_acc = accuracy_score(y_test, pred_y)
            logging.log(logging.DEBUG, 'error rate on test set: %f' % (1.0 - test_acc))
            info['test_error'] = 1.0 - test_acc
            fout = open(path.join(result_path, combine_name+'_test.txt'), 'w')
            for y in pred_y:
                print >>fout, y
            fout.close()

            yield combine_name, feature_name, clf_name, info
Example #3
def pCoverX(featureFamily):
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()
    n_guass = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    fileType = featureFamily+'*.gz'
    for file in glob.glob(fileType):
        print(file)
        X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file)))    
        #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
        X_train = X_train[y_train!=31]
        X_test = X_test[y_test!=31]
        X_val = X_val[y_val!=31]
        y_train = y_train[y_train!=31]
        y_test = y_test[y_test!=31]
        y_val = y_val[y_val!=31]
    #========================= Feature Selection using Variance Threshold =============================================================
        X_train_new, X_test_new , X_val_new = featureSelection(X_train,X_test,X_val,y_train, log=True,tech = 'LinearSVC')
    #========================= Mixture of Gaussians ============================================================
        train_prob,test_prob,val_prob = pXoverC(X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_guass)
    #========================= Calculating Prior, Posterior and Entropy ============================================================
        prr = prior(y_train)
        train_post = posterior(train_prob,prr)
        train_entropy = entropy(train_post)
        
        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)
    
        test_post = posterior(test_prob,prr)
        test_entropy = entropy(test_post)
    
        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)
        
        val_post = posterior(val_prob,prr)
        val_entropy = entropy(val_post)
    
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)
        
        train_acc,c_mat = checkAccuracy(train_post,y_train)
        test_acc,c_mat = checkAccuracy(test_post,y_test)
        val_acc,c_mat = checkAccuracy(val_post,y_val)
        temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]])        
        data_df = data_df.append(temp,ignore_index =True)
        
    return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
def test_load_zero_based_auto():
    data1 = "-1 1:1 2:2 3:3\n"
    data2 = "-1 0:0 1:1\n"

    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
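    # data2 contains feature index 0, so zero_based="auto" infers zero-based
    # indexing for both files; the shared feature space spans indices 0..3,
    # hence 4 columns.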
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
def test_load_with_qid():
    # load svmfile with qid attribute
    data = """
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
Example #6
def test_load_with_qid():
    # load svmfile with qid attribute
    data = b("""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12""")
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
def main():
    x_train, y_train, x_test, y_test = load_svmlight_files(
        ['data/rank.train', 'data/rank.test'])
    train_query = pd.read_csv('data/rank.train.query',
                              header=None).values.flatten()

    model = lgbm.LGBMRanker(num_leaves=50, n_estimators=200, random_state=42)
    print(model)
    model.fit(x_train,
              y_train,
              group=train_query,
              eval_metric='ndcg',
              eval_at=[1, 3, 5])
    preds = model.predict(x_test)

    print(spearmanr(y_test, preds))
    print('DONE')
def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)

    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new= selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #9
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    if data_folder is None:
        data_folder = './data/'
    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'
    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xt_test, yt_test = load_svmlight_files(
        [source_file, target_file, test_file])
    ys, yt, yt_test = (np.array((y + 1) / 2, dtype=int)
                       for y in (ys, yt, yt_test))

    return xs, ys, xt, yt, xt_test, yt_test
def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename), multilabel=True)
    
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)

    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new= selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #11
def get_url(num_rows=None):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz'
    filename = 'url_svmlight.tar.gz'
    if not os.path.isfile(filename):
        urlretrieve(url, filename)
        tar = tarfile.open(filename, "r:gz")
        tar.extractall()
        tar.close()

    num_files = 120
    files = ['url_svmlight/Day{}.svm'.format(day) for day in range(num_files)]
    data = datasets.load_svmlight_files(files)
    X = vstack(data[::2])

    if num_rows is not None:
        X = X[0:num_rows]

    return X
    def support_vector_machines_datasets(self):
        """
        Support Vector Machines (SVMs)

        <label> <feature-id>:<feature-value> <feature-id>:<feature-value>
        1 qid:2 1:0 2:0 3:1 4:0.2 5:0
        2 qid:2 1:1 2:0 3:1 4:0.4 5:0

        SVMlight (SVM Light) is a C program by Thorsten Joachims that implements a support vector machine; it provides several kernels, such as linear, polynomial, radial basis function, and sigmoid.
        LIBSVM -- A Library for Support Vector Machines; it supports multi-class classification.
        """

        logging.debug('----------------- Support Vector Machines  -----------')
        X_train, y_train = datasets.load_svmlight_file("../data/svmlight/example3/train.dat")
        print("Support Vector Machines \n" , X_train, y_train)

        X_train, y_train, X_test, y_test = datasets.load_svmlight_files(("../data/svmlight/example3/train.dat","../data/svmlight/example3/test.dat"))
        print(' X_train ', X_train, 'y_train ',  y_train, ' X_test ', X_test, 'y_test ', y_test)
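
For reference, a minimal sketch (not taken from the source) of writing the <label> <feature-id>:<feature-value> format described above with scikit-learn and loading it back; the file name example3_sketch.dat is purely illustrative.

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

# two samples with five features, matching the layout shown in the docstring
X = np.array([[0, 0, 1, 0.2, 0],
              [1, 0, 1, 0.4, 0]])
y = np.array([1, 2])
dump_svmlight_file(X, y, "example3_sketch.dat", zero_based=False)

# round-trip: only non-zero entries are written, but the dense view matches
X_back, y_back = load_svmlight_file("example3_sketch.dat", zero_based=False)
print(X_back.toarray(), y_back)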
Example #13
    def setUpClass(cls):
        """
        Download and setup the test fixtures
        """
        from sklearn.datasets import load_svmlight_files
        # download the test data
        cls.dpath = 'demo/rank/'
        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
        target = cls.dpath + '/MQ2008.zip'
        urllib.request.urlretrieve(url=src, filename=target)

        with zipfile.ZipFile(target, 'r') as f:
            f.extractall(path=cls.dpath)

        (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid,
         y_valid, qid_valid) = load_svmlight_files(
             (cls.dpath + "MQ2008/Fold1/train.txt", cls.dpath +
              "MQ2008/Fold1/test.txt", cls.dpath + "MQ2008/Fold1/vali.txt"),
             query_id=True,
             zero_based=False)
        # instantiate the matrices
        cls.dtrain = xgboost.DMatrix(x_train, y_train)
        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
        cls.dtest = xgboost.DMatrix(x_test, y_test)
        # set the group counts from the query IDs
        cls.dtrain.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_train)])
        cls.dtest.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_test)])
        cls.dvalid.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_valid)])
        # save the query IDs for testing
        cls.qid_train = qid_train
        cls.qid_test = qid_test
        cls.qid_valid = qid_valid

        # model training parameters
        cls.params = {
            'objective': 'rank:pairwise',
            'booster': 'gbtree',
            'silent': 0,
            'eval_metric': ['ndcg']
        }
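
Not part of the original fixture: a hedged sketch of how a test method could consume the matrices and parameters prepared above (num_boost_round is an arbitrary choice).

    def test_ranking_smoke(self):
        # train a pairwise ranker on the prepared DMatrix and score the test fold
        booster = xgboost.train(self.params, self.dtrain, num_boost_round=10,
                                evals=[(self.dvalid, 'validation')])
        preds = booster.predict(self.dtest)
        assert preds.shape[0] == self.dtest.num_row()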
Example #14
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """
    Load the amazon sentiment datasets from svmlight format files
    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : path to the folder containing the files
    outputs:
        xs : training source data matrix
        ys : training source label vector
        xt : training target data matrix
        yt : training target label vector
        xtest : testing target data matrix
        ytest : testing target label vector
    """

    if data_folder is None:
        data_folder = 'data/'

    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'

    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xtest, ytest = load_svmlight_files(
        [source_file, target_file, test_file])

    # Convert sparse matrices to numpy 2D array
    xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest))

    # Convert {-1,1} labels to {0,1} labels
    ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest))
    num_labels = len(set(ys))
    ys_onehot = np.eye(num_labels)[ys]
    yt_onehot = np.eye(num_labels)[yt]
    ytest_onehot = np.eye(num_labels)[ytest]

    return xs, ys_onehot, xt, yt_onehot, xtest, ytest_onehot
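
A hedged usage sketch for the function above; the 'books' and 'dvd' dataset names and the data/ folder layout are illustrative assumptions, not taken from the source.

xs, ys_onehot, xt, yt_onehot, xtest, ytest_onehot = load_amazon(
    'books', 'dvd', data_folder='data/', verbose=True)
print(xs.shape, ys_onehot.shape)  # one-hot labels: one column per class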
Example #15
def get_mq2008(dpath):
    from sklearn.datasets import load_svmlight_files

    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
    target = dpath + '/MQ2008.zip'
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, 'r') as f:
        f.extractall(path=dpath)

    (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid, y_valid,
     qid_valid) = load_svmlight_files(
         (dpath + "MQ2008/Fold1/train.txt", dpath + "MQ2008/Fold1/test.txt",
          dpath + "MQ2008/Fold1/vali.txt"),
         query_id=True,
         zero_based=False)

    return (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid,
            y_valid, qid_valid)
def select_feature_multilabel(trainfilename, testfilename):
    def returnIG(X, y):
        return randval, p
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename),  multilabel=True)

    featurenum = X_train.shape[1]
    randval = randomValues(X_train, y_train)
    p = np.ones((featurenum,1), int)
    p.reshape(featurenum,1)

    featureNum = X_train.get_shape()[1]
    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #17
def run_nblcr(train,
              test,
              outfn,
              grams='123',
              clf=LogisticRegression(class_weight="auto")):
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'

    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []

    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))

    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(pos_counts, neg_counts)

    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)

    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except:
        # for svm with probability output
        clf.set_params(probability=True)
        y_prob_pos = clf.predict(X_test)
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))

    return y_pred, y_prob
Example #18
def select_feature(trainfilename, testfilename):
    def returnIG(X, y):
        return ig, p
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    featurenum = X_train.shape[1]
    ig = information_gain(X_train, y_train)
    ig = ig.reshape(featurenum,)
    p = np.ones((1,featurenum), int)
    p.reshape(featurenum,1)

    featureNum = X_train.get_shape()[1]
    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #19
def run_nblcr(train, test, outfn, grams='123', clf=LogisticRegression(class_weight="auto")):
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'
    
    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []
        
    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))
        
    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)
        
    dic, r = compute_ratio(pos_counts, neg_counts)
        
    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)
    
    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except:
        # for svm with probability output
        clf.set_params(probability=True)
        y_prob_pos = clf.predict(X_test)
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))
    
    return y_pred, y_prob
    
    
    
Example #20
def load_e2006():
    # load data
    feature_tr, label_tr, feature_te, label_te = load_svmlight_files(['./data/E2006.train', \
            './data/E2006.test'], n_features=150360)
    feature = vstack([feature_tr, feature_te])
    # concatenate train and test labels
    label = np.concatenate([label_tr, label_te], axis=0)
    # remove outliers from labels
    std_y = np.std(label)
    mean_y = np.mean(label)
    mask = np.logical_and(label > mean_y - 3.0 * std_y, label < mean_y + 3.0 * std_y)
    print(f'keep {np.sum(mask)} / {len(mask)} rows')
    # select rows
    feature = feature[mask]
    label = label[mask]

    # scale labels with a min-max scaler
    label = label[:, None]
    scaler = MinMaxScaler()
    scaler.fit(label)
    label = scaler.transform(label).squeeze()
    return feature * 10, label
Example #21
def load_dmatrix(filename):
    '''
    NOTE(sneaxiy): XGBoost distributed training using rabit would
    split a CSV/LIBSVM file into N pieces automatically, where N is
    the number of workers. However, in our implementation, we dump a
    different data file for each worker, and each worker should
    not split the dumped file again when training. Otherwise,
    some data would be lost. To prevent the automatic data sharding
    by XGBoost itself, we load the LIBSVM file using
    'sklearn.datasets.load_svmlight_file' into a CSR sparse matrix
    first, and then convert it to an 'xgboost.DMatrix'.

    See https://github.com/sql-machine-learning/sqlflow/issues/2326
    for details.
    '''
    if xgb.rabit.get_world_size() > 1:
        # XGBoost DMatrix supports to load data from file path like
        # "train.txt#train.txt.cache". The actual data path is
        # "train.txt", while "train.txt.cache" is used as the
        # external memory cache. But "train.txt#train.txt.cache"
        # is not a valid file path, and it is not supported by
        # load_svmlight_file(s). So we remove the suffix "#..."
        # here before loading the data using load_svmlight_file(s).
        if '#' in filename:
            filename = filename[0:filename.index('#')]

        if os.path.isdir(filename):
            files = [os.path.join(filename, f) for f in os.listdir(filename)]
            assert len(files) > 0, "No data file found in {}".format(filename)

            ret = load_svmlight_files(files, zero_based=True)
            X = vstack(ret[0::2])
            y = np.concatenate(ret[1::2], axis=0)
            return xgb.DMatrix(X, y, missing=XGBOOST_NULL_MAGIC)
        else:
            ret = load_svmlight_file(filename, zero_based=True)
            return xgb.DMatrix(ret[0], ret[1], missing=XGBOOST_NULL_MAGIC)
    else:
        return xgb.DMatrix(filename, missing=XGBOOST_NULL_MAGIC)
Example #22
def load_dataset(train_path, test_path, threshold=5):
    """
    Generator that yields an SvmSet for each set (train, test).
    Loads the svmlight-format files into scikit-learn sparse datasets
    and normalizes the values with MinMaxScaler.
    If the score is above the threshold (default = 5) it is mapped to 1 (positive), otherwise 0 (negative).
    :param train_path: train set path
    :param test_path: test set path
    :param threshold: threshold that defines the pivot value
    :return: yields SvmSet(x, y, qid) for each input file
    """
    files = [train_path, test_path]
    dataset = datasets.load_svmlight_files(files=files,
                                           zero_based=True,
                                           query_id=True,
                                           multilabel=False)
    for (x, y, qid) in [dataset[i:i + 3] for i in range(0, len(dataset), 3)]:
        x.data = preprocessing.MinMaxScaler().fit_transform(x.data)

        for idx, score in enumerate(y):
            y[idx] = 1 if score > threshold else 0

        yield SvmSet(x=x, y=y, qid=qid)
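
A usage sketch for the generator above; the file paths are hypothetical and the attribute access assumes SvmSet is a namedtuple-like container.

for svm_set in load_dataset('rank_train.svml', 'rank_test.svml', threshold=5):
    print(svm_set.x.shape, svm_set.qid[:5])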
Example #23
def load_dataset(path_train,
                 path_valid,
                 path_test,
                 n_features,
                 multilabel=False,
                 classes_=None):
    le = LabelEncoder2(multilabel=multilabel)

    X, Y, Xvalid, Yvalid, Xtest, Ytest = load_svmlight_files(
        (path_train, path_valid, path_test),
        dtype=np.float32,
        n_features=n_features,
        multilabel=multilabel)
    if classes_ is None:
        le.fit(np.concatenate((Y, Yvalid, Ytest), axis=0))
        Y = le.transform(Y)
        Yvalid = le.transform(Yvalid)
        Ytest = le.transform(Ytest)
    else:
        le.set_classes(classes_)
        Y = le.transform(Y)
        Yvalid = le.transform(Yvalid)
    return X, Y, Xvalid, Yvalid, Xtest, Ytest
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """
    Load the amazon sentiment datasets from svmlight format files
    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : path to the folder containing the files
    outputs:
        xs : training source data matrix
        ys : training source label vector
        xt : training target data matrix
        yt : training target label vector
        xtest : testing target data matrix
        ytest : testing target label vector
    """

    if data_folder is None:
        data_folder = 'data/'

    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'

    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xtest, ytest = load_svmlight_files([source_file, target_file, test_file])

    # Convert sparse matrices to numpy 2D array
    xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest))

    # Convert {-1,1} labels to {0,1} labels
    ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest))

    return xs, ys, xt, yt, xtest, ytest
Example #25
def mxTrainer(relationName, train, test, train_pair, test_pair):
    X_train, y_train, X_test, y_test = load_svmlight_files([train, test])
    X_train_col = X_train.shape[1]
    X_test_col = X_test.shape[1]
    col = max(X_test_col, X_train_col)
    train_iter = mx.io.LibSVMIter(data_libsvm=train,
                                  data_shape=(col, ),
                                  batch_size=100)
    test_iter = mx.io.LibSVMIter(data_libsvm=test,
                                 data_shape=(col, ),
                                 batch_size=100)
    print(test_iter)
    mod = rankNet()
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    mod.fit(train_iter, num_epoch=5, optimizer="AdaGrad")
    y_pred = mod.predict(test_iter)
    print(relationName + str(y_pred.shape) + str(col))
    y_pred = y_pred.asnumpy().reshape(y_pred.shape[0])
    print(str(y_pred.shape) + str(y_test.shape))
    test_pair['score'] = y_pred
    print(roc_auc_score(y_true=y_test.reshape(y_test.shape[0]),
                        y_score=y_pred))
    writeScoresInPraStyle(test_pair, train_pair, relationName)
Example #26
def main():
    args = get_args()

    # Load training data
    data_train = load_svmlight_files(args.input)
    X_train = vstack(data_train[0::2]).toarray()
    y_train = vstack(data_train[1::2]).toarray()

    # Make model
    if args.model == 'rf':
        model = RandomForestClassifier()
        param_grid = rf_param_grid()
    elif args.model == 'svm_rbf':
        model = SVC()
        param_grid = svm_rbf_param_grid()

    # Grid search hyperparameters
    grid_search = GridSearchCV(estimator=model, scoring='average_precision', param_grid=param_grid,
                               cv=KFold(len(X_train), n_folds=args.kfolds, shuffle=True, random_state=args.seed),
                               n_jobs=args.processes, verbose=2)

    grid_search.fit(X_train, y_train)

    pkl.dump(grid_search, open('temp.pkl', 'wb'))
    probs.append(score_i)
      
  return probs
  
parser = argparse.ArgumentParser()
#parser.add_argument( "train_file" )
parser.add_argument( "-p", "--predict", help = "if is to make predictions in a test file", default = None )
parser.add_argument( "-t", "--predict_file", help = "if is to make predictions in a test file", default = None )
parser.add_argument( "-c", "--cross_validation", help = "if have make cross-validation", default = None )

args = parser.parse_args()

classifier = LDA(n_components=2)
#classifier = RandomForestClassifier()

X_url, y, X_title, y_t, X_body, y_b, X_a, y_a = load_svmlight_files(("url_train.txt", "title_train.txt", "body_train.txt", "all_train.txt"))
X = {"url":X_url, "title": X_title, "body": X_body, "all": X_a}

if(args.predict):
  print "Predicting"
  T_url, t, T_title, y_t, T_body, y_b, T_a, y_a = load_svmlight_files(("url_test.txt", "title_test.txt", "body_test.txt", "all_test.txt"))
  T = {"url": T_url, "title": T_title, "body": T_body, "all": T_a}
  probs = predict(classifier, X, y, T, t)
  
  f = open("sub_31-08_01h15.txt","w")
  f.write("label\n")
  for p in probs:
    line = "%f\n" % p
    f.write(line)
  f.close()
elif(args.cross_validation):
Example #28
from sklearn.datasets import load_svmlight_files


def documentFrequency(X, y):
    featurenum = X.shape[1]
    s = sum(X).toarray()
    p = np.ones((1, featurenum), int)
    return s.reshape(featurenum), p.reshape(featurenum, 1)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: python threshold trainfilename testfilename"
        exit(1)
    trainfilename = sys.argv[2]
    testfilename = sys.argv[3]
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    df = sum(X_train).toarray()[0]
    cnt = 0
    threshold = int(sys.argv[1])
    for i in range(0, len(df)):
        if df[i] >= threshold:
            cnt = cnt + 1
    selector = SelectKBest(documentFrequency, k=cnt)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    sklearn.datasets.dump_svmlight_file(X_train, y_train, trainfilename + "_" + str(cnt), zero_based=False)
    sklearn.datasets.dump_svmlight_file(X_test, y_test, testfilename + "_" + str(cnt), zero_based=False)
    print cnt, "features selected"
Example #29
    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()
    
    
    
os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\dir_data")



X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
np.unique(y_train)

sklearn_lda = LDA(n_components=30)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.todense(), y_train)
plot_scikit_lda(X_lda_sklearn, title='LDA vision_cuboids_histogram')
# PCA
sklearn_pca = sklearnPCA(n_components=30)
X_pca = sklearn_pca.fit_transform(X_train.todense())
plot_pca(title = 'PCA vision_cuboids_histogram')
#
X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA LDA vision_cuboids_histogram', mirror=(-1))
    letter_dataset = np.genfromtxt('realData/letter-recognition.data',
                                   delimiter=",",
                                   converters={0: letter_label})
    ndim = len(letter_dataset[0])
    data = np.zeros((len(letter_dataset), ndim - 1))
    target = np.zeros((len(letter_dataset)))
    for i in xrange(len(letter_dataset)):
        target[i] = letter_dataset[i][0].astype(int)  # last feature is label
        for j in xrange(1, ndim):
            data[i, j - 1] = letter_dataset[i][j]
    target = target.astype(int)

elif (dataset_name == "gas"):
    files = ["realData/Gas/batch" + str(i) + ".dat" for i in xrange(1, 11)]
    batches = skd.load_svmlight_files(files)

    data = batches[0].todense()
    target = batches[1]
    len_target = len(target)
    target = np.reshape(target, (len_target, 1))

    for idx in xrange(2, 11):
        batch_data = batches[(idx - 1) * 2].todense()
        batch_target = batches[2 * idx - 1]
        len_batch_target = len(batch_target)
        batch_target = np.reshape(batch_target, (len_batch_target, 1))

        data = np.concatenate((data, batch_data), axis=0)
        target = np.concatenate((target, batch_target), axis=0)
Example #31
    Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))


train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files([train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'


'''
    Get the cross-validation indices
    model = Word2Vec.load(model_name)
    
    print "Creating the w2v vectors...\n"

    X_train_w2v = scale(getAvgFeatureVecs(getCleanReviews(train), model, n_dim))
    X_test_w2v = scale(getAvgFeatureVecs(getCleanReviews(test), model, n_dim))
    
    print "Generating the svmlight-format files...\n"
    
    generate_svmlight_files(train, test, '123', '../data/nbsvm')
    
    print "Creating the nbsvm...\n"
    
    files = ("../data/nbsvm-train.txt", "../data/nbsvm-test.txt")
     
    X_train_nbsvm, _, X_test_nbsvm, _ = load_svmlight_files(files)
    
    print "Combing the bag of words and the w2v vectors...\n"
    
    X_train_bwv = hstack([X_train_bow, X_train_w2v])
    X_test_bwv = hstack([X_test_bow, X_test_w2v])

    
    print "Combing the bag of words and the d2v vectors...\n"
    
    X_train_bdv = hstack([X_train_bow, X_train_d2v])
    X_test_bdv = hstack([X_test_bow, X_test_d2v])

    
    print "Checking the dimension of training vectors"
    
def textpCoverX():
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()

    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []

    for file in glob.glob("text*.gz"):
        print(file)
        X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
            (gzip.open(path + "train\\" + file),
             gzip.open(path + "test\\" + file),
             gzip.open(path + "validation\\" + file)))

        X_train = X_train[y_train != 31]
        X_test = X_test[y_test != 31]
        X_val = X_val[y_val != 31]
        y_train = y_train[y_train != 31]
        y_test = y_test[y_test != 31]
        y_val = y_val[y_val != 31]

        svmClf = Pipeline([
            ('clf',
             SGDClassifier(loss='log',
                           penalty='l1',
                           alpha=1e-6,
                           n_iter=10,
                           random_state=88)),
        ])
        svmClf = svmClf.fit(X_train, y_train)

        predicted_train = svmClf.predict(X_train)
        train_acc = np.mean(predicted_train == y_train)
        print "Train Model Accuracy %f" % train_acc
        train_post = pd.DataFrame(svmClf.predict_proba(X_train))

        predicted_test = svmClf.predict(X_test)
        test_acc = np.mean(predicted_test == y_test)
        print "Test Model Accuracy %f" % test_acc
        test_post = pd.DataFrame(svmClf.predict_proba(X_test))

        predicted_val = svmClf.predict(X_val)
        val_acc = np.mean(predicted_val == y_val)
        print "Validation Model Accuracy %f" % val_acc
        val_post = pd.DataFrame(svmClf.predict_proba(X_val))

        train_entropy = entropy(train_post)

        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)

        test_entropy = entropy(test_post)

        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)

        val_entropy = entropy(val_post)

        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
Example #34
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #35
'''
    Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))

train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files(
    [train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'
'''
    Get the cross-validation indices
'''
from sklearn.model_selection import KFold
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df


#=============================================== Main =================================================================

#os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\dir_data")
os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
    ("train\\vision_hist_motion_estimate.txt",
     "test\\vision_hist_motion_estimate.txt",
     "validation\\vision_hist_motion_estimate.txt"))

#================ First Level of Fusion - Audio ===============================
train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df = pCoverX(
    'audio')
data_df.columns = [
    'filename', 'train Accuracy', 'test Accuracy', 'validation Accuracy'
]
data_df.to_csv('Audio_preComb_Acc.csv', index=False)

alpha = 1
comb1_audio_train = combiner(train_post_array, train_entropy_array, alpha)
comb1_audio_test = combiner(test_post_array, test_entropy_array, alpha)
comb1_audio_val = combiner(val_post_array, val_entropy_array, alpha)
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #38
import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from mlclas.ensemble import BinaryRelevance, ClassifierChains, CalibratedLabelRanking, RandomKLabelsets, MLKNN
from mlclas.tree import MLDecisionTree
from mlclas.neural import BPMLL
from mlclas.svm import RankingSVM
from mlclas.stats import UniversalMetrics

files = ['datasets/scene_train', 'datasets/scene_test']

# load files
data = datasets.load_svmlight_files(files, multilabel=True)
train_data = data[0]
train_target = np.array(MultiLabelBinarizer().fit_transform(data[1]))
test_data = data[2]
test_target = data[3]

# feature extraction using PCA
feature_size = train_data.shape[1]
pca = PCA(n_components=(feature_size * 10) // 100)
train_data_trans = csr_matrix(pca.fit_transform(train_data.todense()))
test_data_trans = csr_matrix(pca.transform(test_data.todense()))

"""
    train and predict using any of following scripts:

    1.  result = BinaryRelevance(LinearSVC()).fit(train_data, train_target).predict(test_data)
Example #39
        x = range(len(data))
        plt.xticks(x,data[data.columns[0]],rotation='vertical')
        for i in range(1,len(data.columns)):
            plt.plot(x,data[data.columns[i]])
        
    plt.legend(data.columns[1:], loc='upper left')
    plt.xlabel(data.columns[0])
    plt.ylabel('Accuracy')
    plt.title('Accuracy plot for ' + fileName)
    plt.show()

#=============================================== Main =================================================================

os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\Capstone")
#os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_hist_motion_estimate.txt", "test\\vision_hist_motion_estimate.txt","validation\\vision_hist_motion_estimate.txt"))
y_train = y_train[y_train!=31]
y_test = y_test[y_test!=31]
y_val = y_val[y_val!=31]
#y_train = y_train[y_train <=2]
#y_test = y_test[y_test<=2]
#y_val = y_val[y_val<=2]
#================ First Level of Fusion - Audio ===============================
n_guass =5
nClass = 30 
train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df = pCoverX('audio',n_guass,tech = 'LinearSVC',C= 0.5,nClass=30)
data_df.columns = ['filename','train Accuracy','test Accuracy','validation Accuracy']
data_df.to_csv('Audio_preComb_Acc0801.csv',index=False)

audioComb1Acc = pd.DataFrame()
for alpha in [1,2,3,4,5]:
Example #40
def textpCoverX():
    #os.chdir("F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\train")
    #path = "F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\"
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    os.chdir(path+'train')
    
    data_df = pd.DataFrame()
    
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    
    for file in glob.glob("text*.gz"):
        print(file)
        X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file)))    
            
        X_train = X_train[y_train!=31]
        X_test = X_test[y_test!=31]
        X_val = X_val[y_val!=31]
        y_train = y_train[y_train!=31]
        y_test = y_test[y_test!=31]
        y_val = y_val[y_val!=31]
        
        svmClf = Pipeline([ ('clf', SGDClassifier(loss='log', penalty='l1',alpha=1e-6, n_iter=10, random_state=88)),])
        svmClf = svmClf.fit(X_train, y_train)
        
        predicted_train = svmClf.predict(X_train)
        train_acc = np.mean(predicted_train == y_train)     
        print "Train Model Accuracy %f" % train_acc    
        train_post = pd.DataFrame(svmClf.predict_proba(X_train))
        
        predicted_test = svmClf.predict(X_test)
        test_acc = np.mean(predicted_test == y_test)        
        print "Test Model Accuracy %f" % test_acc
        test_post = pd.DataFrame(svmClf.predict_proba(X_test))    
        
        predicted_val = svmClf.predict(X_val)
        val_acc = np.mean(predicted_val == y_val)     
        print "Validation Model Accuracy %f" % val_acc
        val_post = pd.DataFrame(svmClf.predict_proba(X_val))    
        
        
        train_entropy = entropy(train_post)
        
        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)
    
        test_entropy = entropy(test_post)
    
        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)
        
        val_entropy = entropy(val_post)
    
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)
        
        temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]])        
        data_df = data_df.append(temp,ignore_index =True)
        
    return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
Example #41
#splitter
from sklearn.datasets import load_svmlight_files
trn_X, trn_y, tst_X, tst_y = load_svmlight_files(("C:/Users/Ryan/git/nlp/trn.dat", "C:/Users/Ryan/git/nlp/tst.dat"))
print trn_X.shape[1]
from sklearn.datasets import load_svmlight_files
from sklearn import svm
from sklearn.preprocessing import normalize, label_binarize
from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV
import pylab as pl

# read training data and validation data merged using cat in satimage.scale.train
X_train, Y_train, X_test, Y_test = load_svmlight_files(["satimage.scale.train","satimage.scale.t"])

# normalize & binarize
X_train = normalize(X_train)
Y_train = label_binarize(Y_train,classes=[1,2,3,4,5,6])[:,5]
X_test = normalize(X_test)
Y_test = label_binarize(Y_test,classes=[1,2,3,4,5,6])[:,5]

# build the classifier

def svm_score(c,d):
    clf = svm.SVC(C=c,kernel='poly',degree=d)
    kfold = KFold(len(Y_train), n_folds=5)
    scores = cross_val_score(clf,X_train,Y_train,cv=kfold,n_jobs=-1)
    return scores.mean()

x = pl.linspace(1,20,20)
y1=[]
y2=[]
y3=[]
for i in x:
    y1.append(svm_score(i,1))
    y2.append(svm_score(i,2))
def get_data(type):
    return load_svmlight_files(
        ("../data/Fold1/" + type, "../data/Fold2/" + type,
         "../data/Fold3/" + type, "../data/Fold4/" + type,
         "../data/Fold5/" + type))
def fiveFold():

    # Feature groups
    # protocol_dependent = range(13) + range(66,69)
    # protocol_dependent = range(23) + range(66,69)
    # peak features
    # protocol_dependent = range(23,41)
    # All but peak
    # protocol_dependent = range(23) + range(41,69)
    fsslv_cipher_suites = [6, 7, 8, 9, 10, 11, 12]
    protocol_dependent = []

    # Load data
    data_path = os.getcwd() + "/data_set/libSVM"

    train_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_train"
    test_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_test"
    train_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_train"
    test_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_test"
    train_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_train"
    test_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_test"
    train_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_train"
    test_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_test"
    train_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_train"
    test_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_test"

    X_train_0, y_train_0, X_test_0, y_test_0 = load_svmlight_files(
        (train_0, test_0))
    X_train_1, y_train_1, X_test_1, y_test_1 = load_svmlight_files(
        (train_1, test_1))
    X_train_2, y_train_2, X_test_2, y_test_2 = load_svmlight_files(
        (train_2, test_2))
    X_train_3, y_train_3, X_test_3, y_test_3 = load_svmlight_files(
        (train_3, test_3))
    X_train_4, y_train_4, X_test_4, y_test_4 = load_svmlight_files(
        (train_4, test_4))

    df_train_0 = pd.DataFrame(X_train_0.toarray())
    df_test_0 = pd.DataFrame(X_test_0.toarray())
    df_train_1 = pd.DataFrame(X_train_1.toarray())
    df_test_1 = pd.DataFrame(X_test_1.toarray())
    df_train_2 = pd.DataFrame(X_train_2.toarray())
    df_test_2 = pd.DataFrame(X_test_2.toarray())
    df_train_3 = pd.DataFrame(X_train_3.toarray())
    df_test_3 = pd.DataFrame(X_test_3.toarray())
    df_train_4 = pd.DataFrame(X_train_4.toarray())
    df_test_4 = pd.DataFrame(X_test_4.toarray())

    X_train_0 = df_train_0.drop(protocol_dependent, axis=1)
    X_test_0 = df_test_0.drop(protocol_dependent, axis=1)
    X_train_1 = df_train_1.drop(protocol_dependent, axis=1)
    X_test_1 = df_test_1.drop(protocol_dependent, axis=1)
    X_train_2 = df_train_2.drop(protocol_dependent, axis=1)
    X_test_2 = df_test_2.drop(protocol_dependent, axis=1)
    X_train_3 = df_train_3.drop(protocol_dependent, axis=1)
    X_test_3 = df_test_3.drop(protocol_dependent, axis=1)
    X_train_4 = df_train_4.drop(protocol_dependent, axis=1)
    X_test_4 = df_test_4.drop(protocol_dependent, axis=1)

    # X_train_0 = randomProtocolValues(X_train_0)
    # X_test_0 = randomProtocolValues(X_test_0)
    # X_train_1 = randomProtocolValues(X_train_1)
    # X_test_1 = randomProtocolValues(X_test_1)
    # X_train_2 = randomProtocolValues(X_train_2)
    # X_test_2 = randomProtocolValues(X_test_2)
    # X_train_3 = randomProtocolValues(X_train_3)
    # X_test_3 = randomProtocolValues(X_test_3)
    # X_train_4 = randomProtocolValues(X_train_4)
    # X_test_4 = randomProtocolValues(X_test_4)

    # Prepare ensemble method
    estimators = []
    model1 = KNeighborsClassifier(n_neighbors=16,
                                  algorithm='ball_tree',
                                  metric='canberra',
                                  n_jobs=-1)
    estimators.append(('knn', model1))
    model2 = SVC(gamma=0.0078125, C=8192, probability=False)
    estimators.append(('svmrbf', model2))
    model3 = DecisionTreeClassifier()  #max_depth=50)
    estimators.append(('DecisionTree', model3))
    model4 = RandomForestClassifier(n_estimators=100,
                                    oob_score=True,
                                    n_jobs=-1)
    estimators.append(('RandomForest', model4))
    model5 = XGBClassifier(max_depth=10, n_estimators=100, learning_rate=0.1)
    estimators.append(('XGBoost', model5))

    # ensemble = VotingClassifier(estimators,voting='hard')
    ensemble = CategoryClassifier()

    # CategoricalEnsembleVoting(X_train_0, y_train_0, X_test_0, y_test_0)
    oneFold(X_train_0, y_train_0, X_test_0, y_test_0, ensemble)
    oneFold(X_train_1, y_train_1, X_test_1, y_test_1, ensemble)
    oneFold(X_train_2, y_train_2, X_test_2, y_test_2, ensemble)
    oneFold(X_train_3, y_train_3, X_test_3, y_test_3, ensemble)
    oneFold(X_train_4, y_train_4, X_test_4, y_test_4, ensemble)
Example #45
    data_x, data_y = data[0][:1000, :5], data[1][:1000]
    data_y_binary = (data_y > 5).astype(np.int32)

    print("Binary classification")
    print("training model")
    model = xgboost.XGBClassifier(n_estimators=10)
    model.fit(data_x, data_y_binary)

    features = ["f{0}".format(i) for i in range(data_x.shape[1])]
    target_names = [
        "cls{0}".format(i) for i in range(len(np.unique(data_y_binary)))
    ]

    bdt = BDTxgboost(model, features, target_names)
    bdt.to_tmva("test.xml")
    bdt.setup_tmva("test.xml")

    d1 = 0.0
    for irow in range(data_x.shape[0]):
        predA1 = bdt.eval_tmva(data_x[irow, :])
        predB1 = bdt.eval(data_x[irow, :])
        d1 += np.abs((predA1 - predB1) / predA1)


if __name__ == "__main__":
    print("fetching data")
    data = load_svmlight_files(("usps", "usps.t"))

    #simple_test_xgboost()
    unittest.main()
def pCoverX(featureFamily):
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()
    n_guass = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    fileType = featureFamily + '*.gz'
    for file in glob.glob(fileType):
        print(file)
        X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
            (gzip.open(path + "train\\" + file),
             gzip.open(path + "test\\" + file),
             gzip.open(path + "validation\\" + file)))
        #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
        X_train = X_train[y_train != 31]
        X_test = X_test[y_test != 31]
        X_val = X_val[y_val != 31]
        y_train = y_train[y_train != 31]
        y_test = y_test[y_test != 31]
        y_val = y_val[y_val != 31]
        #========================= Feature Selection using Variance Threshold =============================================================
        X_train_new, X_test_new, X_val_new = featureSelection(X_train,
                                                              X_test,
                                                              X_val,
                                                              y_train,
                                                              log=True,
                                                              tech='LinearSVC')
        #========================= Mixture of Gaussians ============================================================
        train_prob, test_prob, val_prob = pXoverC(X_train_new, y_train,
                                                  X_test_new, y_test,
                                                  X_val_new, y_val, n_guass)
        #========================= Calculating Prior, Posterior and Entropy ============================================================
        prr = prior(y_train)
        train_post = posterior(train_prob, prr)
        train_entropy = entropy(train_post)

        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)

        test_post = posterior(test_prob, prr)
        test_entropy = entropy(test_post)

        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)

        val_post = posterior(val_prob, prr)
        val_entropy = entropy(val_post)

        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        train_acc, c_mat = checkAccuracy(train_post, y_train)
        test_acc, c_mat = checkAccuracy(test_post, y_test)
        val_acc, c_mat = checkAccuracy(val_post, y_val)
        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
Example #47
def svm_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with SVM with Linear kernel.

    X_*: Samples.
    y_*: labels.
    """
    print 'SVM w/ Linear kernel'
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    score = 100 * clf.score(X_test.toarray(), y_test)

    print 'SVM score: %.2f%%' % score
    return score


if __name__ == '__main__':
    # `data_size` is an integer which controls how big the data set is.
    # Use None to use the whole dataset.
    # split_libsvm_dataset(path='skin.txt', data_size=None)

    # Load train and test samples (X) + labels (y).
    X_train, y_train, X_test, y_test = load_svmlight_files(
        ('skin-train.libsvm', 'skin-test.libsvm'))

    svm_skin(X_train, y_train, X_test, y_test)

    # iterations, scores = adaboost_skin(X_train, y_train, X_test, y_test)
    # graph = plot_success_per_size(iterations, scores)
    # show()
Example #48
    for key in data:
        print('{"%s":"%s"}' % (key, data[key]))


write_dict({'pca_file': 'pca_plot.png'})

import datetime
print(datetime.datetime.now())

from numpy import genfromtxt
print("feature_file:", feature_file)

if '.csv' in feature_file:
    X = genfromtxt(feature_file, delimiter=',')
elif '.libsvm' in feature_file:
    X, y = datasets.load_svmlight_files([feature_file])
    X = X.toarray()
#    y = y.toarray()

svc = sc.load_model(args.model_file)

if svc == None:
    svc = svm.SVC(C=args.C,
                  kernel=args.kernel,
                  degree=args.degree,
                  gamma=args.gamma,
                  coef0=args.coef0,
                  shrinking=args.shrinking,
                  probability=args.probability,
                  tol=args.tol,
                  cache_size=args.cache_size,
Example #49
===================================================

In this example we show how to handle LIBSVM file format.
"""

from sklearn.datasets import load_svmlight_files
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load LIBSVM files.
# Note that these example files are not included in this repository.
# You can fetch them from: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20
print("Loading LIBSVM files...")
(train_X, train_y, test_X, test_y) = load_svmlight_files(['news20', 'news20.t'])

# Create a Train Dataset.
print("Creating train dataset...")
train_ds = Dataset.from_matrix(train_X, train_y)

# Create a Test Dataset
print("Creating test dataset...")
test_ds = Dataset.from_matrix(test_X, test_y)

# Create a Classifier Service
classifier = Classifier.run(Config())

# Train the classifier.
print("Training...")
for (idx, _) in classifier.train(train_ds):
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #51
# import pdb
# pdb.set_trace()
# print "Features: ", len(data["data"][0])
# print "Instances: ", len(data["data"])
# print len(set(data["target"]))

data = datasets.load_mlcomp()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
import pdb
pdb.set_trace()

data = datasets.load_sample_image()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_sample_images()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_svmlight_file()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_svmlight_files()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))