def test_load_svmlight_files():
    X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, dtype=np.float32)
    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_equal(y_train, y_test)
    assert_equal(X_train.dtype, np.float32)
    assert_equal(X_test.dtype, np.float32)

    X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, dtype=np.float64)
    assert_equal(X1.dtype, X2.dtype)
    assert_equal(X2.dtype, X3.dtype)
    assert_equal(X3.dtype, np.float64)
Example #2
def classify_test(feature_list=[], classifiers=[], root_path='./'):
    #load data set
    datasets = []
    for name in feature_list:
        logging.log(logging.DEBUG, 'loading data: %s ...' % name)
        filenames = tuple(['./feature/%s_%s' % (name, tag) for tag in ['train.txt', 'test.txt']])
        X_train, y_train, X_test, y_test = load_svmlight_files(filenames)
        datasets.append((name, X_train, y_train, X_test, y_test))

    #make directory to store results
    result_path = path.join(root_path, 'results')
    if path.exists(result_path):
        assert path.isdir(result_path), 'results must be a directory!'
    else:
        system('mkdir ' + result_path)

    for clf in classifiers:
        for feature in datasets:
            clf_name = clf.__class__.__name__
            feature_name, X_train, y_train, X_test, y_test = feature
            combine_name = feature_name+'_'+clf_name
            info = {}

            logging.log(logging.DEBUG, 'classification test: %s ...' % combine_name)

            logging.log(logging.DEBUG, 'training...')
            t0 = time()
            clf.fit(X_train, y_train)
            t1 = time()
            info['training_time'] = t1-t0

            logging.log(logging.DEBUG, 'testing on training...')
            pred_y = clf.predict(X_train)
            training_acc = accuracy_score(y_train, pred_y)
            logging.log(logging.DEBUG, 'error rate on training set: %f' % (1.0 - training_acc))
            info['training_error'] = 1.0 - training_acc
            fout = open(path.join(result_path, combine_name+'_train.txt'), 'w')
            for y in pred_y:
                print >>fout, y
            fout.close()

            logging.log(logging.DEBUG, 'testing...')
            t0 = time()
            pred_y = clf.predict(X_test)
            t1 = time()
            info['test_time'] = t1-t0
            test_acc = accuracy_score(y_test, pred_y)
            logging.log(logging.DEBUG, 'error rate on test set: %f' % (1.0 - test_acc))
            info['test_error'] = 1.0 - test_acc
            fout = open(path.join(result_path, combine_name+'_test.txt'), 'w')
            for y in pred_y:
                print >>fout, y
            fout.close()

            yield combine_name, feature_name, clf_name, info
Example #3
def pCoverX(featureFamily):
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()
    n_guass = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    fileType = featureFamily+'*.gz'
    for file in glob.glob(fileType):
        print(file)
        X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file)))    
        #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
        X_train = X_train[y_train!=31]
        X_test = X_test[y_test!=31]
        X_val = X_val[y_val!=31]
        y_train = y_train[y_train!=31]
        y_test = y_test[y_test!=31]
        y_val = y_val[y_val!=31]
    #========================= Feature Selection using Variance Threshold =============================================================
        X_train_new, X_test_new , X_val_new = featureSelection(X_train,X_test,X_val,y_train, log=True,tech = 'LinearSVC')
    #========================= Mixture of Gaussians ============================================================
        train_prob,test_prob,val_prob = pXoverC(X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_guass)
    #========================= Calculating Prior, Posterior and Entropy ============================================================
        prr = prior(y_train)
        train_post = posterior(train_prob,prr)
        train_entropy = entropy(train_post)
        
        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)
    
        test_post = posterior(test_prob,prr)
        test_entropy = entropy(test_post)
    
        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)
        
        val_post = posterior(val_prob,prr)
        val_entropy = entropy(val_post)
    
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)
        
        train_acc,c_mat = checkAccuracy(train_post,y_train)
        test_acc,c_mat = checkAccuracy(test_post,y_test)
        val_acc,c_mat = checkAccuracy(val_post,y_val)
        temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]])        
        data_df = data_df.append(temp,ignore_index =True)
        
    return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
def test_load_zero_based_auto():
    data1 = "-1 1:1 2:2 3:3\n"
    data2 = "-1 0:0 1:1\n"

    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
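    # data2 contains feature index 0, so zero_based="auto" infers zero-based
    # indexing for both files; the shared feature space spans indices 0..3,
    # hence 4 columns.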
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
def test_load_with_qid():
    # load svmfile with qid attribute
    data = """
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
Example #6
def test_load_with_qid():
    # load svmfile with qid attribute
    data = b("""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12""")
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
def main():
    x_train, y_train, x_test, y_test = load_svmlight_files(
        ['data/rank.train', 'data/rank.test'])
    train_query = pd.read_csv('data/rank.train.query',
                              header=None).values.flatten()

    model = lgbm.LGBMRanker(num_leaves=50, n_estimators=200, random_state=42)
    print(model)
    model.fit(x_train,
              y_train,
              group=train_query,
              eval_metric='ndcg',
              eval_at=[1, 3, 5])
    preds = model.predict(x_test)

    print(spearmanr(y_test, preds))
    print('DONE')
def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)

    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new= selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #9
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    if data_folder is None:
        data_folder = './data/'
    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'
    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xt_test, yt_test = load_svmlight_files(
        [source_file, target_file, test_file])
    ys, yt, yt_test = (np.array((y + 1) / 2, dtype=int)
                       for y in (ys, yt, yt_test))

    return xs, ys, xt, yt, xt_test, yt_test
def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename), multilabel=True)
    
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)

    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new= selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #11
def get_url(num_rows=None):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz'
    filename = 'url_svmlight.tar.gz'
    if not os.path.isfile(filename):
        urlretrieve(url, filename)
        tar = tarfile.open(filename, "r:gz")
        tar.extractall()
        tar.close()

    num_files = 120
    files = ['url_svmlight/Day{}.svm'.format(day) for day in range(num_files)]
    data = datasets.load_svmlight_files(files)
    X = vstack(data[::2])

    if num_rows is not None:
        X = X[0:num_rows]

    return X
    def support_vector_machines_datasets(self):
        """
        Support Vector Machines (SVMs)

        <label> <feature-id>:<feature-value> <feature-id>:<feature-value>
        1 qid:2 1:0 2:0 3:1 4:0.2 5:0
        2 qid:2 1:1 2:0 3:1 4:0.4 5:0

        SVMlight (SVM Light) is a C program by Thorsten Joachims that implements a support vector machine; it provides several kernels, such as linear, polynomial, radial basis function, and sigmoid.
        LIBSVM -- A Library for Support Vector Machines; it supports multi-class classification.
        """

        logging.debug('----------------- Support Vector Machines  -----------')
        X_train, y_train = datasets.load_svmlight_file("../data/svmlight/example3/train.dat")
        print("Support Vector Machines \n" , X_train, y_train)

        X_train, y_train, X_test, y_test = datasets.load_svmlight_files(("../data/svmlight/example3/train.dat","../data/svmlight/example3/test.dat"))
        print(' X_train ', X_train, 'y_train ',  y_train, ' X_test ', X_test, 'y_test ', y_test)
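
For reference, a minimal sketch (not taken from the source) of writing the <label> <feature-id>:<feature-value> format described above with scikit-learn and loading it back; the file name example3_sketch.dat is purely illustrative.

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

# two samples with five features, matching the layout shown in the docstring
X = np.array([[0, 0, 1, 0.2, 0],
              [1, 0, 1, 0.4, 0]])
y = np.array([1, 2])
dump_svmlight_file(X, y, "example3_sketch.dat", zero_based=False)

# round-trip: only non-zero entries are written, but the dense view matches
X_back, y_back = load_svmlight_file("example3_sketch.dat", zero_based=False)
print(X_back.toarray(), y_back)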
Example #13
    def setUpClass(cls):
        """
        Download and setup the test fixtures
        """
        from sklearn.datasets import load_svmlight_files
        # download the test data
        cls.dpath = 'demo/rank/'
        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
        target = cls.dpath + '/MQ2008.zip'
        urllib.request.urlretrieve(url=src, filename=target)

        with zipfile.ZipFile(target, 'r') as f:
            f.extractall(path=cls.dpath)

        (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid,
         y_valid, qid_valid) = load_svmlight_files(
             (cls.dpath + "MQ2008/Fold1/train.txt", cls.dpath +
              "MQ2008/Fold1/test.txt", cls.dpath + "MQ2008/Fold1/vali.txt"),
             query_id=True,
             zero_based=False)
        # instantiate the matrices
        cls.dtrain = xgboost.DMatrix(x_train, y_train)
        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
        cls.dtest = xgboost.DMatrix(x_test, y_test)
        # set the group counts from the query IDs
        cls.dtrain.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_train)])
        cls.dtest.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_test)])
        cls.dvalid.set_group(
            [len(list(items)) for _key, items in itertools.groupby(qid_valid)])
        # save the query IDs for testing
        cls.qid_train = qid_train
        cls.qid_test = qid_test
        cls.qid_valid = qid_valid

        # model training parameters
        cls.params = {
            'objective': 'rank:pairwise',
            'booster': 'gbtree',
            'silent': 0,
            'eval_metric': ['ndcg']
        }
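
Not part of the original fixture: a hedged sketch of how a test method could consume the matrices and parameters prepared above (num_boost_round is an arbitrary choice).

    def test_ranking_smoke(self):
        # train a pairwise ranker on the prepared DMatrix and score the test fold
        booster = xgboost.train(self.params, self.dtrain, num_boost_round=10,
                                evals=[(self.dvalid, 'validation')])
        preds = booster.predict(self.dtest)
        assert preds.shape[0] == self.dtest.num_row()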
Example #14
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """
    Load the amazon sentiment datasets from svmlight format files
    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : path to the folder containing the files
    outputs:
        xs : training source data matrix
        ys : training source label vector
        xt : training target data matrix
        yt : training target label vector
        xtest : testing target data matrix
        ytest : testing target label vector
    """

    if data_folder is None:
        data_folder = 'data/'

    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'

    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xtest, ytest = load_svmlight_files(
        [source_file, target_file, test_file])

    # Convert sparse matrices to numpy 2D array
    xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest))

    # Convert {-1,1} labels to {0,1} labels
    ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest))
    num_labels = len(set(ys))
    ys_onehot = np.eye(num_labels)[ys]
    yt_onehot = np.eye(num_labels)[yt]
    ytest_onehot = np.eye(num_labels)[ytest]

    return xs, ys_onehot, xt, yt_onehot, xtest, ytest_onehot
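
A hedged usage sketch for the function above; the 'books' and 'dvd' dataset names and the data/ folder layout are illustrative assumptions, not taken from the source.

xs, ys_onehot, xt, yt_onehot, xtest, ytest_onehot = load_amazon(
    'books', 'dvd', data_folder='data/', verbose=True)
print(xs.shape, ys_onehot.shape)  # one-hot labels: one column per class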
Example #15
def get_mq2008(dpath):
    from sklearn.datasets import load_svmlight_files

    src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
    target = dpath + '/MQ2008.zip'
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, 'r') as f:
        f.extractall(path=dpath)

    (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid, y_valid,
     qid_valid) = load_svmlight_files(
         (dpath + "MQ2008/Fold1/train.txt", dpath + "MQ2008/Fold1/test.txt",
          dpath + "MQ2008/Fold1/vali.txt"),
         query_id=True,
         zero_based=False)

    return (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid,
            y_valid, qid_valid)
def select_feature_multilabel(trainfilename, testfilename):
    def returnIG(X, y):
        return randval, p
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename),  multilabel=True)

    featurenum = X_train.shape[1]
    randval = randomValues(X_train, y_train)
    p = np.ones((featurenum,1), int)
    p.reshape(featurenum,1)

    featureNum = X_train.get_shape()[1]
    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #17
def run_nblcr(train,
              test,
              outfn,
              grams='123',
              clf=LogisticRegression(class_weight="auto")):
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'

    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []

    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))

    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(pos_counts, neg_counts)

    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)

    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except:
        # for svm with probability output
        clf.set_params(probability=True)
        y_prob_pos = clf.predict(X_test)
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))

    return y_pred, y_prob
Example #18
def select_feature(trainfilename, testfilename):
    def returnIG(X, y):
        return ig, p
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    featurenum = X_train.shape[1]
    ig = information_gain(X_train, y_train)
    ig = ig.reshape(featurenum,)
    p = np.ones((1,featurenum), int)
    p.reshape(featurenum,1)

    featureNum = X_train.get_shape()[1]
    step = featureNum / 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
Example #19
def run_nblcr(train, test, outfn, grams='123', clf=LogisticRegression(class_weight="auto")):
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'
    
    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []
        
    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))
        
    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)
        
    dic, r = compute_ratio(pos_counts, neg_counts)
        
    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)
    
    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except:
        # for svm with probability output
        clf.set_params(probability=True)
        y_prob_pos = clf.predict(X_test)
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))
    
    return y_pred, y_prob
    
    
    
Example #20
def load_e2006():
    # load data
    feature_tr, label_tr, feature_te, label_te = load_svmlight_files(['./data/E2006.train', \
            './data/E2006.test'], n_features=150360)
    feature = vstack([feature_tr, feature_te])
    # concatenate train and test labels
    label = np.concatenate([label_tr, label_te], axis=0)
    # remove outliers from labels
    std_y = np.std(label)
    mean_y = np.mean(label)
    mask = np.logical_and(label > mean_y - 3.0 * std_y, label < mean_y + 3.0 * std_y)
    print(f'keep {np.sum(mask)} / {len(mask)} rows')
    # select rows
    feature = feature[mask]
    label = label[mask]

    # scale labels with a min-max scaler
    label = label[:, None]
    scaler = MinMaxScaler()
    scaler.fit(label)
    label = scaler.transform(label).squeeze()
    return feature * 10, label
Example #21
def load_dmatrix(filename):
    '''
    NOTE(sneaxiy): XGBoost distributed training using rabit would
    split a CSV/LIBSVM file into N pieces automatically, where N is
    the number of workers. However, in our implementation, we dump a
    different data file for each worker, and each worker should
    not split the dumped file again when training. Otherwise,
    some data would be lost. To prevent the automatic data sharding
    by XGBoost itself, we load the LIBSVM file using
    'sklearn.datasets.load_svmlight_file' into a CSR sparse matrix
    first, and then convert it to an 'xgboost.DMatrix'.

    See https://github.com/sql-machine-learning/sqlflow/issues/2326
    for details.
    '''
    if xgb.rabit.get_world_size() > 1:
        # XGBoost DMatrix supports to load data from file path like
        # "train.txt#train.txt.cache". The actual data path is
        # "train.txt", while "train.txt.cache" is used as the
        # external memory cache. But "train.txt#train.txt.cache"
        # is not a valid file path, and it is not supported by
        # load_svmlight_file(s). So we remove the suffix "#..."
        # here before loading the data using load_svmlight_file(s).
        if '#' in filename:
            filename = filename[0:filename.index('#')]

        if os.path.isdir(filename):
            files = [os.path.join(filename, f) for f in os.listdir(filename)]
            assert len(files) > 0, "No data file found in {}".format(filename)

            ret = load_svmlight_files(files, zero_based=True)
            X = vstack(ret[0::2])
            y = np.concatenate(ret[1::2], axis=0)
            return xgb.DMatrix(X, y, missing=XGBOOST_NULL_MAGIC)
        else:
            ret = load_svmlight_file(filename, zero_based=True)
            return xgb.DMatrix(ret[0], ret[1], missing=XGBOOST_NULL_MAGIC)
    else:
        return xgb.DMatrix(filename, missing=XGBOOST_NULL_MAGIC)
Example #22
def load_dataset(train_path, test_path, threshold=5):
    """
    Generator that yields an SvmSet for each set (train, test).
    Loads the svmlight-format files into scikit-learn sparse datasets
    and normalizes the values with MinMaxScaler.
    If the score is above the threshold (default = 5) it is mapped to 1 (positive), otherwise 0 (negative).
    :param train_path: train set path
    :param test_path: test set path
    :param threshold: threshold that defines the pivot value
    :return: yields SvmSet(x, y, qid) for each input file
    """
    files = [train_path, test_path]
    dataset = datasets.load_svmlight_files(files=files,
                                           zero_based=True,
                                           query_id=True,
                                           multilabel=False)
    for (x, y, qid) in [dataset[i:i + 3] for i in range(0, len(dataset), 3)]:
        x.data = preprocessing.MinMaxScaler().fit_transform(x.data)

        for idx, score in enumerate(y):
            y[idx] = 1 if score > threshold else 0

        yield SvmSet(x=x, y=y, qid=qid)
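
A usage sketch for the generator above; the file paths are hypothetical and the attribute access assumes SvmSet is a namedtuple-like container.

for svm_set in load_dataset('rank_train.svml', 'rank_test.svml', threshold=5):
    print(svm_set.x.shape, svm_set.qid[:5])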
Example #23
def load_dataset(path_train,
                 path_valid,
                 path_test,
                 n_features,
                 multilabel=False,
                 classes_=None):
    le = LabelEncoder2(multilabel=multilabel)

    X, Y, Xvalid, Yvalid, Xtest, Ytest = load_svmlight_files(
        (path_train, path_valid, path_test),
        dtype=np.float32,
        n_features=n_features,
        multilabel=multilabel)
    if classes_ is None:
        le.fit(np.concatenate((Y, Yvalid, Ytest), axis=0))
        Y = le.transform(Y)
        Yvalid = le.transform(Yvalid)
        Ytest = le.transform(Ytest)
    else:
        le.set_classes(classes_)
        Y = le.transform(Y)
        Yvalid = le.transform(Yvalid)
    return X, Y, Xvalid, Yvalid, Xtest, Ytest
def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """
    Load the amazon sentiment datasets from svmlight format files
    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : path to the folder containing the files
    outputs:
        xs : training source data matrix
        ys : training source label vector
        xt : training target data matrix
        yt : training target label vector
        xtest : testing target data matrix
        ytest : testing target label vector
    """

    if data_folder is None:
        data_folder = 'data/'

    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'

    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file:  ', test_file)

    xs, ys, xt, yt, xtest, ytest = load_svmlight_files([source_file, target_file, test_file])

    # Convert sparse matrices to numpy 2D array
    xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest))

    # Convert {-1,1} labels to {0,1} labels
    ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest))

    return xs, ys, xt, yt, xtest, ytest
Example #25
def mxTrainer(relationName, train, test, train_pair, test_pair):
    X_train, y_train, X_test, y_test = load_svmlight_files([train, test])
    X_train_col = X_train.shape[1]
    X_test_col = X_test.shape[1]
    col = max(X_test_col, X_train_col)
    train_iter = mx.io.LibSVMIter(data_libsvm=train,
                                  data_shape=(col, ),
                                  batch_size=100)
    test_iter = mx.io.LibSVMIter(data_libsvm=test,
                                 data_shape=(col, ),
                                 batch_size=100)
    print(test_iter)
    mod = rankNet()
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    mod.fit(train_iter, num_epoch=5, optimizer="AdaGrad")
    y_pred = mod.predict(test_iter)
    print(relationName + str(y_pred.shape) + str(col))
    y_pred = y_pred.asnumpy().reshape(y_pred.shape[0])
    print(str(y_pred.shape) + str(y_test.shape))
    test_pair['score'] = y_pred
    print(roc_auc_score(y_true=y_test.reshape(y_test.shape[0]),
                        y_score=y_pred))
    writeScoresInPraStyle(test_pair, train_pair, relationName)
Example #26
def main():
    args = get_args()

    # Load training data
    data_train = load_svmlight_files(args.input)
    X_train = vstack(data_train[0::2]).toarray()
    y_train = vstack(data_train[1::2]).toarray()

    # Make model
    if args.model == 'rf':
        model = RandomForestClassifier()
        param_grid = rf_param_grid()
    elif args.model == 'svm_rbf':
        model = SVC()
        param_grid = svm_rbf_param_grid()

    # Grid search hyperparameters
    grid_search = GridSearchCV(estimator=model, scoring='average_precision', param_grid=param_grid,
                               cv=KFold(len(X_train), n_folds=args.kfolds, shuffle=True, random_state=args.seed),
                               n_jobs=args.processes, verbose=2)

    grid_search.fit(X_train, y_train)

    pkl.dump(grid_search, open('temp.pkl', 'wb'))
    probs.append(score_i)
      
  return probs
  
parser = argparse.ArgumentParser()
#parser.add_argument( "train_file" )
parser.add_argument( "-p", "--predict", help = "if is to make predictions in a test file", default = None )
parser.add_argument( "-t", "--predict_file", help = "if is to make predictions in a test file", default = None )
parser.add_argument( "-c", "--cross_validation", help = "if have make cross-validation", default = None )

args = parser.parse_args()

classifier = LDA(n_components=2)
#classifier = RandomForestClassifier()

X_url, y, X_title, y_t, X_body, y_b, X_a, y_a = load_svmlight_files(("url_train.txt", "title_train.txt", "body_train.txt", "all_train.txt"))
X = {"url":X_url, "title": X_title, "body": X_body, "all": X_a}

if(args.predict):
  print "Predicting"
  T_url, t, T_title, y_t, T_body, y_b, T_a, y_a = load_svmlight_files(("url_test.txt", "title_test.txt", "body_test.txt", "all_test.txt"))
  T = {"url": T_url, "title": T_title, "body": T_body, "all": T_a}
  probs = predict(classifier, X, y, T, t)
  
  f = open("sub_31-08_01h15.txt","w")
  f.write("label\n")
  for p in probs:
    line = "%f\n" % p
    f.write(line)
  f.close()
elif(args.cross_validation):
Example #28
from sklearn.datasets import load_svmlight_files


def documentFrequency(X, y):
    featurenum = X.shape[1]
    s = sum(X).toarray()
    p = np.ones((1, featurenum), int)
    return s.reshape(featurenum), p.reshape(featurenum, 1)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: python threshold trainfilename testfilename"
        exit(1)
    trainfilename = sys.argv[2]
    testfilename = sys.argv[3]
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    df = sum(X_train).toarray()[0]
    cnt = 0
    threshold = int(sys.argv[1])
    for i in range(0, len(df)):
        if df[i] >= threshold:
            cnt = cnt + 1
    selector = SelectKBest(documentFrequency, k=cnt)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    sklearn.datasets.dump_svmlight_file(X_train, y_train, trainfilename + "_" + str(cnt), zero_based=False)
    sklearn.datasets.dump_svmlight_file(X_test, y_test, testfilename + "_" + str(cnt), zero_based=False)
    print cnt, "features selected"
Example #29
    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()
    
    
    
os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\dir_data")



X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
np.unique(y_train)

sklearn_lda = LDA(n_components=30)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.todense(), y_train)
plot_scikit_lda(X_lda_sklearn, title='LDA vision_cuboids_histogram')
# PCA
sklearn_pca = sklearnPCA(n_components=30)
X_pca = sklearn_pca.fit_transform(X_train.todense())
plot_pca(title = 'PCA vision_cuboids_histogram')
#
X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA LDA vision_cuboids_histogram', mirror=(-1))
    letter_dataset = np.genfromtxt('realData/letter-recognition.data',
                                   delimiter=",",
                                   converters={0: letter_label})
    ndim = len(letter_dataset[0])
    data = np.zeros((len(letter_dataset), ndim - 1))
    target = np.zeros((len(letter_dataset)))
    for i in xrange(len(letter_dataset)):
        target[i] = letter_dataset[i][0].astype(int)  # last feature is label
        for j in xrange(1, ndim):
            data[i, j - 1] = letter_dataset[i][j]
    target = target.astype(int)

elif (dataset_name == "gas"):
    files = ["realData/Gas/batch" + str(i) + ".dat" for i in xrange(1, 11)]
    batches = skd.load_svmlight_files(files)

    data = batches[0].todense()
    target = batches[1]
    len_target = len(target)
    target = np.reshape(target, (len_target, 1))

    for idx in xrange(2, 11):
        batch_data = batches[(idx - 1) * 2].todense()
        batch_target = batches[2 * idx - 1]
        len_batch_target = len(batch_target)
        batch_target = np.reshape(batch_target, (len_batch_target, 1))

        data = np.concatenate((data, batch_data), axis=0)
        target = np.concatenate((target, batch_target), axis=0)
Example #31
    Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))


train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files([train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'


'''
    Get the cross-validation indices
    model = Word2Vec.load(model_name)
    
    print "Creating the w2v vectors...\n"

    X_train_w2v = scale(getAvgFeatureVecs(getCleanReviews(train), model, n_dim))
    X_test_w2v = scale(getAvgFeatureVecs(getCleanReviews(test), model, n_dim))
    
    print "Generating the svmlight-format files...\n"
    
    generate_svmlight_files(train, test, '123', '../data/nbsvm')
    
    print "Creating the nbsvm...\n"
    
    files = ("../data/nbsvm-train.txt", "../data/nbsvm-test.txt")
     
    X_train_nbsvm, _, X_test_nbsvm, _ = load_svmlight_files(files)
    
    print "Combing the bag of words and the w2v vectors...\n"
    
    X_train_bwv = hstack([X_train_bow, X_train_w2v])
    X_test_bwv = hstack([X_test_bow, X_test_w2v])

    
    print "Combing the bag of words and the d2v vectors...\n"
    
    X_train_bdv = hstack([X_train_bow, X_train_d2v])
    X_test_bdv = hstack([X_test_bow, X_test_d2v])

    
    print "Checking the dimension of training vectors"
    
def textpCoverX():
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()

    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []

    for file in glob.glob("text*.gz"):
        print(file)
        X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
            (gzip.open(path + "train\\" + file),
             gzip.open(path + "test\\" + file),
             gzip.open(path + "validation\\" + file)))

        X_train = X_train[y_train != 31]
        X_test = X_test[y_test != 31]
        X_val = X_val[y_val != 31]
        y_train = y_train[y_train != 31]
        y_test = y_test[y_test != 31]
        y_val = y_val[y_val != 31]

        svmClf = Pipeline([
            ('clf',
             SGDClassifier(loss='log',
                           penalty='l1',
                           alpha=1e-6,
                           n_iter=10,
                           random_state=88)),
        ])
        svmClf = svmClf.fit(X_train, y_train)

        predicted_train = svmClf.predict(X_train)
        train_acc = np.mean(predicted_train == y_train)
        print "Train Model Accuracy %f" % train_acc
        train_post = pd.DataFrame(svmClf.predict_proba(X_train))

        predicted_test = svmClf.predict(X_test)
        test_acc = np.mean(predicted_test == y_test)
        print "Test Model Accuracy %f" % test_acc
        test_post = pd.DataFrame(svmClf.predict_proba(X_test))

        predicted_val = svmClf.predict(X_val)
        val_acc = np.mean(predicted_val == y_val)
        print "Validation Model Accuracy %f" % val_acc
        val_post = pd.DataFrame(svmClf.predict_proba(X_val))

        train_entropy = entropy(train_post)

        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)

        test_entropy = entropy(test_post)

        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)

        val_entropy = entropy(val_post)

        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
Example #34
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #35
'''
    Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))

train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files(
    [train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'
'''
    Get the cross-validation indices
'''
from sklearn.model_selection import KFold
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df


#=============================================== Main =================================================================

#os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\dir_data")
os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
    ("train\\vision_hist_motion_estimate.txt",
     "test\\vision_hist_motion_estimate.txt",
     "validation\\vision_hist_motion_estimate.txt"))

#================ First Level of Fusion - Audio ===============================
train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df = pCoverX(
    'audio')
data_df.columns = [
    'filename', 'train Accuracy', 'test Accuracy', 'validation Accuracy'
]
data_df.to_csv('Audio_preComb_Acc.csv', index=False)

alpha = 1
comb1_audio_train = combiner(train_post_array, train_entropy_array, alpha)
comb1_audio_test = combiner(test_post_array, test_entropy_array, alpha)
comb1_audio_val = combiner(val_post_array, val_entropy_array, alpha)
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #38
import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from mlclas.ensemble import BinaryRelevance, ClassifierChains, CalibratedLabelRanking, RandomKLabelsets, MLKNN
from mlclas.tree import MLDecisionTree
from mlclas.neural import BPMLL
from mlclas.svm import RankingSVM
from mlclas.stats import UniversalMetrics

files = ['datasets/scene_train', 'datasets/scene_test']

# load files
data = datasets.load_svmlight_files(files, multilabel=True)
train_data = data[0]
train_target = np.array(MultiLabelBinarizer().fit_transform(data[1]))
test_data = data[2]
test_target = data[3]

# feature extraction using PCA
feature_size = train_data.shape[1]
pca = PCA(n_components=(feature_size * 10) // 100)
train_data_trans = csr_matrix(pca.fit_transform(train_data.todense()))
test_data_trans = csr_matrix(pca.transform(test_data.todense()))

"""
    train and predict using any of following scripts:

    1.  result = BinaryRelevance(LinearSVC()).fit(train_data, train_target).predict(test_data)
Example #39
        x = range(len(data))
        plt.xticks(x,data[data.columns[0]],rotation='vertical')
        for i in range(1,len(data.columns)):
            plt.plot(x,data[data.columns[i]])
        
    plt.legend(data.columns[1:], loc='upper left')
    plt.xlabel(data.columns[0])
    plt.ylabel('Accuracy')
    plt.title('Accuracy plot for ' + fileName)
    plt.show()

#=============================================== Main =================================================================

os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\Capstone")
#os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_hist_motion_estimate.txt", "test\\vision_hist_motion_estimate.txt","validation\\vision_hist_motion_estimate.txt"))
y_train = y_train[y_train!=31]
y_test = y_test[y_test!=31]
y_val = y_val[y_val!=31]
#y_train = y_train[y_train <=2]
#y_test = y_test[y_test<=2]
#y_val = y_val[y_val<=2]
#================ First Level of Fusion - Audio ===============================
n_guass =5
nClass = 30 
train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df = pCoverX('audio',n_guass,tech = 'LinearSVC',C= 0.5,nClass=30)
data_df.columns = ['filename','train Accuracy','test Accuracy','validation Accuracy']
data_df.to_csv('Audio_preComb_Acc0801.csv',index=False)

audioComb1Acc = pd.DataFrame()
for alpha in [1,2,3,4,5]:
Example #40
def textpCoverX():
    #os.chdir("F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\train")
    #path = "F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\"
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    os.chdir(path+'train')
    
    data_df = pd.DataFrame()
    
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    
    for file in glob.glob("text*.gz"):
        print(file)
        X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file)))    
            
        X_train = X_train[y_train!=31]
        X_test = X_test[y_test!=31]
        X_val = X_val[y_val!=31]
        y_train = y_train[y_train!=31]
        y_test = y_test[y_test!=31]
        y_val = y_val[y_val!=31]
        
        svmClf = Pipeline([ ('clf', SGDClassifier(loss='log', penalty='l1',alpha=1e-6, n_iter=10, random_state=88)),])
        svmClf = svmClf.fit(X_train, y_train)
        
        predicted_train = svmClf.predict(X_train)
        train_acc = np.mean(predicted_train == y_train)     
        print "Train Model Accuracy %f" % train_acc    
        train_post = pd.DataFrame(svmClf.predict_proba(X_train))
        
        predicted_test = svmClf.predict(X_test)
        test_acc = np.mean(predicted_test == y_test)        
        print "Test Model Accuracy %f" % test_acc
        test_post = pd.DataFrame(svmClf.predict_proba(X_test))    
        
        predicted_val = svmClf.predict(X_val)
        val_acc = np.mean(predicted_val == y_val)     
        print "Validation Model Accuracy %f" % val_acc
        val_post = pd.DataFrame(svmClf.predict_proba(X_val))    
        
        
        train_entropy = entropy(train_post)
        
        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)
    
        test_entropy = entropy(test_post)
    
        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)
        
        val_entropy = entropy(val_post)
    
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)
        
        temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]])        
        data_df = data_df.append(temp,ignore_index =True)
        
    return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
Example #41
#splitter
from sklearn.datasets import load_svmlight_files
trn_X, trn_y, tst_X, tst_y = load_svmlight_files(("C:/Users/Ryan/git/nlp/trn.dat", "C:/Users/Ryan/git/nlp/tst.dat"))
print trn_X.shape[1]
from sklearn.datasets import load_svmlight_files
from sklearn import svm
from sklearn.preprocessing import normalize, label_binarize
from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV
import pylab as pl

# read training data and validation data merged using cat in satimage.scale.train
X_train, Y_train, X_test, Y_test = load_svmlight_files(["satimage.scale.train","satimage.scale.t"])

# normalize & binarize
X_train = normalize(X_train)
Y_train = label_binarize(Y_train,classes=[1,2,3,4,5,6])[:,5]
X_test = normalize(X_test)
Y_test = label_binarize(Y_test,classes=[1,2,3,4,5,6])[:,5]

# build the classifier

def svm_score(c,d):
    clf = svm.SVC(C=c,kernel='poly',degree=d)
    kfold = KFold(len(Y_train), n_folds=5)
    scores = cross_val_score(clf,X_train,Y_train,cv=kfold,n_jobs=-1)
    return scores.mean()

x = pl.linspace(1,20,20)
y1=[]
y2=[]
y3=[]
for i in x:
    y1.append(svm_score(i,1))
    y2.append(svm_score(i,2))
def get_data(type):
    return load_svmlight_files(
        ("../data/Fold1/" + type, "../data/Fold2/" + type,
         "../data/Fold3/" + type, "../data/Fold4/" + type,
         "../data/Fold5/" + type))
def fiveFold():

    # Feature groups
    # protocol_dependent = range(13) + range(66,69)
    # protocol_dependent = range(23) + range(66,69)
    # peak features
    # protocol_dependent = range(23,41)
    # All but peak
    # protocol_dependent = range(23) + range(41,69)
    fsslv_cipher_suites = [6, 7, 8, 9, 10, 11, 12]
    protocol_dependent = []

    # Load data
    data_path = os.getcwd() + "/data_set/libSVM"

    train_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_train"
    test_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_test"
    train_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_train"
    test_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_test"
    train_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_train"
    test_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_test"
    train_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_train"
    test_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_test"
    train_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_train"
    test_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_test"

    X_train_0, y_train_0, X_test_0, y_test_0 = load_svmlight_files(
        (train_0, test_0))
    X_train_1, y_train_1, X_test_1, y_test_1 = load_svmlight_files(
        (train_1, test_1))
    X_train_2, y_train_2, X_test_2, y_test_2 = load_svmlight_files(
        (train_2, test_2))
    X_train_3, y_train_3, X_test_3, y_test_3 = load_svmlight_files(
        (train_3, test_3))
    X_train_4, y_train_4, X_test_4, y_test_4 = load_svmlight_files(
        (train_4, test_4))

    df_train_0 = pd.DataFrame(X_train_0.toarray())
    df_test_0 = pd.DataFrame(X_test_0.toarray())
    df_train_1 = pd.DataFrame(X_train_1.toarray())
    df_test_1 = pd.DataFrame(X_test_1.toarray())
    df_train_2 = pd.DataFrame(X_train_2.toarray())
    df_test_2 = pd.DataFrame(X_test_2.toarray())
    df_train_3 = pd.DataFrame(X_train_3.toarray())
    df_test_3 = pd.DataFrame(X_test_3.toarray())
    df_train_4 = pd.DataFrame(X_train_4.toarray())
    df_test_4 = pd.DataFrame(X_test_4.toarray())

    X_train_0 = df_train_0.drop(protocol_dependent, axis=1)
    X_test_0 = df_test_0.drop(protocol_dependent, axis=1)
    X_train_1 = df_train_1.drop(protocol_dependent, axis=1)
    X_test_1 = df_test_1.drop(protocol_dependent, axis=1)
    X_train_2 = df_train_2.drop(protocol_dependent, axis=1)
    X_test_2 = df_test_2.drop(protocol_dependent, axis=1)
    X_train_3 = df_train_3.drop(protocol_dependent, axis=1)
    X_test_3 = df_test_3.drop(protocol_dependent, axis=1)
    X_train_4 = df_train_4.drop(protocol_dependent, axis=1)
    X_test_4 = df_test_4.drop(protocol_dependent, axis=1)

    # X_train_0 = randomProtocolValues(X_train_0)
    # X_test_0 = randomProtocolValues(X_test_0)
    # X_train_1 = randomProtocolValues(X_train_1)
    # X_test_1 = randomProtocolValues(X_test_1)
    # X_train_2 = randomProtocolValues(X_train_2)
    # X_test_2 = randomProtocolValues(X_test_2)
    # X_train_3 = randomProtocolValues(X_train_3)
    # X_test_3 = randomProtocolValues(X_test_3)
    # X_train_4 = randomProtocolValues(X_train_4)
    # X_test_4 = randomProtocolValues(X_test_4)

    # Prepare ensemble method
    estimators = []
    model1 = KNeighborsClassifier(n_neighbors=16,
                                  algorithm='ball_tree',
                                  metric='canberra',
                                  n_jobs=-1)
    estimators.append(('knn', model1))
    model2 = SVC(gamma=0.0078125, C=8192, probability=False)
    estimators.append(('svmrbf', model2))
    model3 = DecisionTreeClassifier()  #max_depth=50)
    estimators.append(('DecisionTree', model3))
    model4 = RandomForestClassifier(n_estimators=100,
                                    oob_score=True,
                                    n_jobs=-1)
    estimators.append(('RandomForest', model4))
    model5 = XGBClassifier(max_depth=10, n_estimators=100, learning_rate=0.1)
    estimators.append(('XGBoost', model5))

    # ensemble = VotingClassifier(estimators,voting='hard')
    ensemble = CategoryClassifier()

    # CategoricalEnsembleVoting(X_train_0, y_train_0, X_test_0, y_test_0)
    oneFold(X_train_0, y_train_0, X_test_0, y_test_0, ensemble)
    oneFold(X_train_1, y_train_1, X_test_1, y_test_1, ensemble)
    oneFold(X_train_2, y_train_2, X_test_2, y_test_2, ensemble)
    oneFold(X_train_3, y_train_3, X_test_3, y_test_3, ensemble)
    oneFold(X_train_4, y_train_4, X_test_4, y_test_4, ensemble)
Example #45
    data_x, data_y = data[0][:1000, :5], data[1][:1000]
    data_y_binary = (data_y > 5).astype(np.int32)

    print("Binary classification")
    print("training model")
    model = xgboost.XGBClassifier(n_estimators=10)
    model.fit(data_x, data_y_binary)

    features = ["f{0}".format(i) for i in range(data_x.shape[1])]
    target_names = [
        "cls{0}".format(i) for i in range(len(np.unique(data_y_binary)))
    ]

    bdt = BDTxgboost(model, features, target_names)
    bdt.to_tmva("test.xml")
    bdt.setup_tmva("test.xml")

    d1 = 0.0
    for irow in range(data_x.shape[0]):
        predA1 = bdt.eval_tmva(data_x[irow, :])
        predB1 = bdt.eval(data_x[irow, :])
        d1 += np.abs((predA1 - predB1) / predA1)


if __name__ == "__main__":
    print("fetching data")
    data = load_svmlight_files(("usps", "usps.t"))

    #simple_test_xgboost()
    unittest.main()
def pCoverX(featureFamily):
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()
    n_guass = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    fileType = featureFamily + '*.gz'
    for file in glob.glob(fileType):
        print(file)
        X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
            (gzip.open(path + "train\\" + file),
             gzip.open(path + "test\\" + file),
             gzip.open(path + "validation\\" + file)))
        #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt"))
        X_train = X_train[y_train != 31]
        X_test = X_test[y_test != 31]
        X_val = X_val[y_val != 31]
        y_train = y_train[y_train != 31]
        y_test = y_test[y_test != 31]
        y_val = y_val[y_val != 31]
        #========================= Feature Selection using Variance Threshold =============================================================
        X_train_new, X_test_new, X_val_new = featureSelection(X_train,
                                                              X_test,
                                                              X_val,
                                                              y_train,
                                                              log=True,
                                                              tech='LinearSVC')
        #========================= Mixture of Gaussians ============================================================
        train_prob, test_prob, val_prob = pXoverC(X_train_new, y_train,
                                                  X_test_new, y_test,
                                                  X_val_new, y_val, n_guass)
        #========================= Calculating Prior, Posterior and Entropy ============================================================
        prr = prior(y_train)
        train_post = posterior(train_prob, prr)
        train_entropy = entropy(train_post)

        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)

        test_post = posterior(test_prob, prr)
        test_entropy = entropy(test_post)

        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)

        val_post = posterior(val_prob, prr)
        val_entropy = entropy(val_post)

        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)

        train_acc, c_mat = checkAccuracy(train_post, y_train)
        test_acc, c_mat = checkAccuracy(test_post, y_test)
        val_acc, c_mat = checkAccuracy(val_post, y_val)
        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)

    return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
Example #47
def svm_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with SVM with Linear kernel.

    X_*: Samples.
    y_*: labels.
    """
    print 'SVM w/ Linear kernel'
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    score = 100 * clf.score(X_test.toarray(), y_test)

    print 'SVM score: %.2f%%' % score
    return score


if __name__ == '__main__':
    # `data_size` is an integer which controls how big the data set is.
    # Use None to use the whole dataset.
    # split_libsvm_dataset(path='skin.txt', data_size=None)

    # Load train and test samples (X) + labels (y).
    X_train, y_train, X_test, y_test = load_svmlight_files(
        ('skin-train.libsvm', 'skin-test.libsvm'))

    svm_skin(X_train, y_train, X_test, y_test)

    # iterations, scores = adaboost_skin(X_train, y_train, X_test, y_test)
    # graph = plot_success_per_size(iterations, scores)
    # show()
Example #48
    for key in data:
        print('{"%s":"%s"}' % (key, data[key]))


write_dict({'pca_file': 'pca_plot.png'})

import datetime
print(datetime.datetime.now())

from numpy import genfromtxt
print("feature_file:", feature_file)

if '.csv' in feature_file:
    X = genfromtxt(feature_file, delimiter=',')
elif '.libsvm' in feature_file:
    X, y = datasets.load_svmlight_files([feature_file])
    X = X.toarray()
#    y = y.toarray()

svc = sc.load_model(args.model_file)

if svc == None:
    svc = svm.SVC(C=args.C,
                  kernel=args.kernel,
                  degree=args.degree,
                  gamma=args.gamma,
                  coef0=args.coef0,
                  shrinking=args.shrinking,
                  probability=args.probability,
                  tol=args.tol,
                  cache_size=args.cache_size,
Example #49
===================================================

In this example we show how to handle LIBSVM file format.
"""

from sklearn.datasets import load_svmlight_files
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load LIBSVM files.
# Note that these example files are not included in this repository.
# You can fetch them from: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20
print("Loading LIBSVM files...")
(train_X, train_y, test_X, test_y) = load_svmlight_files(['news20', 'news20.t'])

# Create a Train Dataset.
print("Creating train dataset...")
train_ds = Dataset.from_matrix(train_X, train_y)

# Create a Test Dataset
print("Creating test dataset...")
test_ds = Dataset.from_matrix(test_X, test_y)

# Create a Classifier Service
classifier = Classifier.run(Config())

# Train the classifier.
print("Training...")
for (idx, _) in classifier.train(train_ds):
def test_load_invalid_file2():
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])
Example #51
# import pdb
# pdb.set_trace()
# print "Features: ", len(data["data"][0])
# print "Instances: ", len(data["data"])
# print len(set(data["target"]))

data = datasets.load_mlcomp()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
import pdb
pdb.set_trace()

data = datasets.load_sample_image()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_sample_images()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_svmlight_file()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_svmlight_files()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))