def _run(parameters):
    """Fit one model configuration on one dataset and return its labelled result.

    Args:
        parameters: A pair ``(dataset_params, (label, params))``.
            ``dataset_params`` holds the keyword arguments passed to
            ``logistic.PyDataset``; ``label`` identifies the run in the
            progress messages and in the return value; ``params`` holds the
            keyword arguments passed to ``logistic.PygisticSGD``.

    Returns:
        The tuple ``(label, result)`` where ``result`` is whatever
        ``fit`` returns for the loaded dataset.
    """
    # NOTE(review): everything arrives packed in a single argument, presumably
    # so the function can be mapped over a list of jobs -- confirm with caller.
    print(parameters)
    dataset_params, (label, params) = parameters

    # The dataset is rebuilt and loaded inside the call from its parameters.
    data = logistic.PyDataset(**dataset_params)
    data.load()

    sgd = logistic.PygisticSGD(**params)
    print('run model {}'.format(label))
    outcome = sgd.fit(data)
    print('done model {}'.format(label))

    return label, outcome
import os
import logistic
import pickle
import numpy as np

# Output folder for experiment results. `exist_ok=True` creates it only when
# missing, without the check-then-create race of a separate `os.path.exists`
# test (another process could create the folder between the two calls).
directory = "results"
os.makedirs(directory, exist_ok=True)

# create datasets
# Only the dataset handles are constructed here; nothing is loaded yet.
# NOTE(review): the positional arguments appear to be
# (name, input file, number of samples, number of features, is_sparse) --
# confirm against the PyDataset constructor.
# NOTE(review): madelon points at a /Users/jb path while the other datasets
# live under /mlodata1/jb -- verify the path on the target machine.
madelon = logistic.PyDataset("madelon", "/Users/jb/data/madelon.txt", 2000, 500, False)
rcv1 = logistic.PyDataset("rcv1", "/mlodata1/jb/data/rcv1_train.binary", 20242, 47236, True)
rcv1_test = logistic.PyDataset("rcv1_test", "/mlodata1/jb/data/rcv1_test.binary", 677399, 47236, True)
epsilon = logistic.PyDataset("epsilon", "/mlodata1/jb/data/epsilon_normalized", 400000, 2000, True)

# """
# RCV1-test theory
# """
# if not rcv1_test.is_loaded():
#     rcv1_test.load()
#
# num_samples = 677399
# num_features = 47236
# dataset = rcv1_test
# dataset_name = "RCV1"
# file = "rcv1-th.pickle"
#
# CREATE PARAMS FOR DATASET
#
# Keyword arguments forwarded verbatim to logistic.PyDataset below.
dataset_params = dict(
    name="rcv1_test",
    inputFile="/mlodata1/jb/data/rcv1_test.binary",
    numSamples=677399,
    numFeatures=47236,
    is_sparse=True,
)

# Dataset dimensions, read back from the parameters so they stay in sync.
num_features = dataset_params["numFeatures"]
num_samples = dataset_params["numSamples"]

dataset = logistic.PyDataset(**dataset_params)

#
# CREATE PARAMS FOR EXPERIMENT
#
# Settings shared by the experiment configurations: tau is ten times the
# feature count and lambda_ is the reciprocal of the sample count.
common_params = dict(
    numEpochs=20,
    lrType="optimal",
    lr=1.,
    tau=10 * num_features,
    lambda_=1. / num_samples,
    printPerEpoch=20,
    weightingScheme='final',
)
from sklearn.linear_model import SGDClassifier from sklearn.datasets import load_svmlight_file import time import numpy as np repeat = 3 # #RVV1 # rcv1_test = logistic.PyDataset("rcv1_test", "/mlodata1/jb/data/rcv1_test.binary", 677399, 47236, True) # svm_path = "/mlodata1/jb/data/rcv1_test.binary" # dataset = rcv1_test # num_samples = 677399 # print("RUN ON RCV1") #epsilon epsilon = logistic.PyDataset("epsilon", "/mlodata1/jb/data/epsilon_normalized", 400000, 2000, True) svm_path = "/mlodata1/jb/data/epsilon_normalized" dataset = epsilon num_samples = 400000 print("RUN ON epsilon") dataset.load() res = np.zeros(repeat) for i in range(repeat): model = logistic.PygisticSGD(numEpochs=1, lrType="bottou", lr=.1, tau=1., lambda_=1. / num_samples, weightingScheme='final',