def score_ngca_on_clover_data_by_svm(alpha1, alpha2, beta1, beta2):
    # get samples and labels from train data
    train_shuffled_data = utilities.download_data('cloverDataShuffledTrn')
    train_data = utilities.download_data('cloverDataTrn')
    kmeans_train_data = KMeans(n_clusters=4, random_state=0).fit(train_data)  # get 4 cluster labels
    train_labels = kmeans_train_data.labels_

    # validation_shuffled_data = utilities.download_data('cloverDataShuffledVdn')
    # validation_data = utilities.download_data('cloverDataVdn')
    # kmeans_validation_data = KMeans(n_clusters=4, random_state=0).fit(validation_data)  # get 4 cluster labels
    # validation_labels = kmeans_validation_data.labels_

    # run algorithm on samples from train data
    train_samples, train_samples_copy = utilities.download_data('cloverDataShuffledTrn', separate_data=True)
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)

    # if the subspace is not of dimension 2, return the worst score
    if approx_ng_subspace.shape[1] != 2:
        print('subspace dimension is not 2')
        return 1

    # project train data on the resulting subspace
    proj_train_shuffled_data = np.dot(train_shuffled_data, approx_ng_subspace)

    # build SVM classifier - fit and score on the projected train data
    # (validation scoring is commented out above)
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_shuffled_data, train_labels)
    predicted_train_labels = clf.predict(proj_train_shuffled_data)

    # assign score
    score = clf.score(proj_train_shuffled_data, train_labels)  # score by SVM model
    # score = adjusted_rand_score(train_labels, predicted_train_labels)
    return 1 - score  # we want to minimize the score
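# Hypothetical usage sketch (not part of the original pipeline): because
# score_ngca_on_clover_data_by_svm returns 1 - accuracy, the four NGCA
# parameters can be tuned by minimizing it over a grid. The grid values below
# are illustrative assumptions, not tuned defaults.
import itertools

def grid_search_clover_params(grid=(0.3, 0.5, 0.7, 0.9)):
    # track the lowest score seen so far; 1 is the worst possible score
    best_score, best_params = 1.0, None
    for alpha1, alpha2, beta1, beta2 in itertools.product(grid, repeat=4):
        score = score_ngca_on_clover_data_by_svm(alpha1, alpha2, beta1, beta2)
        if score < best_score:
            best_score, best_params = score, (alpha1, alpha2, beta1, beta2)
    return best_params, best_score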
def evaluate_test_data_by_svm(algorithm_params):
    # get samples from train and test data
    test_samples, test_samples_copy = utilities.download_data('DataTst', separate_data=True)
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)

    # get samples and labels from train and test data
    train_data = utilities.download_data('DataTrn')
    train_labels = utilities.download_labels('DataTrn')
    test_data = utilities.download_data('DataTst')
    test_labels = utilities.download_labels('DataTst')

    # run algorithm on samples from train and test data
    approx_train_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, algorithm_params)
    approx_test_ng_subspace = run_ngca_algorithm(test_samples, test_samples_copy, algorithm_params)

    # project train and test data on their resulting subspaces
    proj_train_data = np.dot(train_data, approx_train_ng_subspace)
    proj_test_data = np.dot(test_data, approx_test_ng_subspace)

    # build SVM classifier - fit on train data and check prediction (score) on test data
    # clf = SVC(gamma='auto')
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_data, train_labels)

    # assign score
    score = clf.score(proj_test_data, test_labels)
    print('Score on test data:')
    utilities.print_score(score)

    plot_2d_data(proj_test_data, test_labels, algorithm_params)
    plot_3d_data(proj_test_data, test_labels, algorithm_params)
    return
def evaluate_test_data_by_kmeans(algorithm_params):
    # get samples from test data
    test_samples, test_samples_copy = utilities.download_data('DataTst', separate_data=True)

    # get samples and labels from test data
    test_data = utilities.download_data('DataTst')
    test_labels = utilities.download_labels('DataTst')

    # run algorithm on samples from test data
    approx_ng_subspace = run_ngca_algorithm(test_samples, test_samples_copy, algorithm_params)

    # project test data on the resulting subspace
    proj_data = np.dot(test_data, approx_ng_subspace)

    # evaluate data clustering by KMeans
    score = utilities.get_result_score_by_kmeans(proj_data, test_labels, 3)
    print('Score on test data:')
    utilities.print_score_fixed(score)

    # plot data in 2D & 3D
    plot_2d_data(proj_data, algorithm_params, test_labels)
    plot_3d_data(proj_data, algorithm_params, test_labels)
    return
def scoring_by_kmeans():
    # get samples from train data
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)

    # get samples and labels from validation data
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    algorithm_params = {
        'alpha1': 0.7,
        'alpha2': 0.3,
        'beta1': 0.34,
        'beta2': 0.64,
    }

    # run algorithm on samples from train data
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, algorithm_params)

    # project validation data on the resulting subspace
    proj_data = np.dot(validation_data, approx_ng_subspace)

    # evaluate data clustering by KMeans
    score = utilities.get_result_score_by_kmeans(proj_data, validation_labels, 3)

    # print score
    utilities.print_score_fixed(score)

    # plot data in 2D & 3D
    plot_2d_data(proj_data, algorithm_params, validation_labels)
    plot_3d_data(proj_data, algorithm_params, validation_labels)
    return score
def score_ngca_on_oil_data_by_svm(alpha1, alpha2, beta1, beta2):
    # get samples and labels from train and validation data
    train_data = utilities.download_data('DataTrn')
    train_labels = utilities.download_labels('DataTrn')
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    # run algorithm on samples from train data
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)
    approx_train_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)
    print('reduced 12 dimensions to {} dimensions'.format(approx_train_ng_subspace.shape[1]))

    # if the subspace dimension is not between 3 and 6, return the worst score - invalid dimensions
    if approx_train_ng_subspace.shape[1] < 3 or approx_train_ng_subspace.shape[1] > 6:
        print('Train NG subspace dimension should be between 3 and 6')
        return 1

    # project train data on the resulting subspace to extract the NG components
    proj_train_data = np.dot(train_data, approx_train_ng_subspace)

    # run algorithm on samples from validation data
    validation_samples, validation_samples_copy = utilities.download_data('DataVdn', separate_data=True)
    approx_validation_ng_subspace = run_ngca_algorithm(validation_samples, validation_samples_copy, alpha1, alpha2, beta1, beta2)
    print('reduced 12 dimensions to {} dimensions'.format(approx_validation_ng_subspace.shape[1]))

    # if the subspace dimension is not between 3 and 6, return the worst score - invalid dimensions
    if approx_validation_ng_subspace.shape[1] < 3 or approx_validation_ng_subspace.shape[1] > 6:
        print('Validation NG subspace dimension should be between 3 and 6')
        return 1

    if approx_train_ng_subspace.shape[1] != approx_validation_ng_subspace.shape[1]:
        print('Validation and Train NG subspace dimensions are different')
        return 1

    # project validation data on the resulting subspace to extract the NG components
    proj_validation_data = np.dot(validation_data, approx_validation_ng_subspace)

    # build SVM classifier - fit on train data and check prediction on validation data
    # clf = SVC(gamma='auto')
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_data, train_labels)

    # assign score
    score = clf.score(proj_validation_data, validation_labels)  # score by SVM model
    train_score = clf.score(proj_train_data, train_labels)  # score by SVM model
    print('train score: {}'.format(train_score))
    return 1 - score  # we want to minimize the score
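# Alternative scoring sketch (an assumption, not the author's method): instead
# of a single train/validation split, sklearn's cross_val_score can estimate
# the SVM accuracy on the projected train data alone, which may give a more
# stable objective for parameter search.
from sklearn.model_selection import cross_val_score

def cv_score_on_projection(proj_data, labels, folds=5):
    clf = SVC(kernel='rbf', C=500, gamma=0.1)  # same hyperparameters as above
    # mean accuracy across folds; return 1 - mean so lower is better, as above
    return 1 - cross_val_score(clf, proj_data, labels, cv=folds).mean()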
def evaluate_test_data_by_kmeans(algorithm_params):
    samples, samples_copy = utilities.download_data('blanchard_clover_shuffled_full', separate_data=True)
    shuffled_data_full = utilities.download_data('blanchard_clover_shuffled_full')
    clover_data = utilities.download_data('blanchard_clover_data')

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy, algorithm_params)

    # project shuffled data on the resulting subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # plot data in 2D
    plot_2d_data(clover_data, shuffled_data_full, proj_data)
    return
def evaluate_test_data_by_svm(algorithm_params):
    # samples, samples_copy = utilities.download_data('cloverDataShuffledTst', separate_data=True)
    samples, samples_copy = utilities.download_data('cloverDataShuffledTrn', separate_data=True)
    # shuffled_data_full = utilities.download_data('cloverDataShuffledTst')
    shuffled_data_full = utilities.download_data('cloverDataShuffledTrn')
    # clover_data = utilities.download_data('cloverDataTst')
    clover_data = utilities.download_data('cloverDataTrn')

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy, algorithm_params)

    # project shuffled data on the resulting subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # plot data in 2D
    plot_2d_data(clover_data, shuffled_data_full, proj_data)
    return
def score_ngca_on_oil_data_by_kmeans(alpha1, alpha2, beta1, beta2):
    # get samples from train data
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)

    # get samples and labels from validation data
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    # run NGCA on train data
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)

    # project validation data on the resulting subspace
    proj_data = np.dot(validation_data, approx_ng_subspace)

    # evaluate data clustering by KMeans
    score = utilities.get_result_score_by_kmeans(proj_data, validation_labels, 3)
    return score
def score_ngca_algorithm_on_clover_data_by_kmeans(alpha1, alpha2, beta1, beta2):
    samples, samples_copy = utilities.download_data('blanchard_clover_shuffled_full', separate_data=True)
    shuffled_data_full = utilities.download_data('blanchard_clover_shuffled_full')
    clover_data = utilities.download_data('blanchard_clover_data')
    kmeans_clover = KMeans(n_clusters=4, random_state=0).fit(clover_data)  # get 4 cluster labels
    clover_kmeans_labels = kmeans_clover.labels_

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy, alpha1, alpha2, beta1, beta2)

    # project shuffled data on the resulting subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # evaluate result data by KMeans
    kmeans_result = KMeans(n_clusters=4, random_state=0).fit(proj_data)  # get 4 cluster labels
    predicted_result_labels = kmeans_result.labels_
    score = utilities.score_labels(clover_kmeans_labels, predicted_result_labels)
    return score
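# Note on comparing clusterings: KMeans label ids are arbitrary, so comparing
# clover_kmeans_labels with predicted_result_labels element-wise would be
# permutation-sensitive. utilities.score_labels presumably accounts for this;
# a minimal permutation-invariant alternative is sklearn's adjusted Rand index
# (1.0 for identical partitions, about 0.0 for random ones).
from sklearn.metrics import adjusted_rand_score

def score_labels_ari(true_labels, predicted_labels):
    # ARI is symmetric and invariant to relabeling of the clusters
    return adjusted_rand_score(true_labels, predicted_labels)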
def main(argv=None):  # IGNORE:C0111
    """Command line options."""

    if argv is not None:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s - %s

  Created by user_name on %s.
  Copyright 2013 Freedom. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_name, program_shortdesc, str(__date__))

    enable_workers = (cpu_count() > 1)

    # Setup argument parser
    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument('-V', '--version', action='version', version=program_version_message)
    parser.add_argument('-g', '--grades', default='A,B,C,D,E,F,G',
                        help="Comma separated list of credit grades to test [default: %(default)s]")
    parser.add_argument('-a', '--states', default='CA,AZ,FL,GA,IL,MD,NV,TX,NY',
                        help="Comma separated list of states to test [default: %(default)s]")
    parser.add_argument('-t', '--terms', default='36,60',
                        help="Comma separated list of loan terms to test [default: %(default)s]")
    parser.add_argument('-s', '--seed', default=100,
                        help="Random Number Generator Seed [default: %(default)s]")
    parser.add_argument('-d', '--data',
                        default="https://resources.lendingclub.com/LoanStats3a.csv.zip,https://resources.lendingclub.com/LoanStats3b.csv.zip,https://resources.lendingclub.com/LoanStats3c.csv.zip",
                        help="Comma separated download paths for the notes data files [default: %(default)s]")
    parser.add_argument('-l', '--stats', default="LoanStats3a.csv.zip,LoanStats3b.csv.zip,LoanStats3c.csv.zip",
                        help="Comma separated list of input Loan Stats CSV files [default: %(default)s]")
    parser.add_argument('-c', '--csvresults', default="lc_best.csv",
                        help="Output best results CSV file [default: %(default)s]")
    parser.add_argument('-p', '--population_size', default=512, type=int,
                        help="population size [default: %(default)s]")
    parser.add_argument('-i', '--iterations', default=4096, type=int,
                        help="how many Genetic Algorithm iterations to perform [default: %(default)s]")
    parser.add_argument('-e', '--elite_rate', default=0.05, type=float,
                        help="elite rate [default: %(default)s]")
    parser.add_argument('-m', '--mutation_rate', default=0.05, type=float,
                        help="mutation rate [default: %(default)s]")
    parser.add_argument('-k', '--check', default=False, action='store_true',
                        help="checking mode: open the CSV results file and filter the loans into a separate file [default: %(default)s]")
    parser.add_argument('-r', '--checkresults', default="LoanStatsNewFiltered.csv",
                        help="file name for the filtered results used during checking mode [default: %(default)s]")
    parser.add_argument('-z', '--zmq', default=True, action='store_true',
                        help="Use zmq libraries for multi-processing [default: %(default)s]")
    parser.add_argument('-q', '--sqlite', default=1, type=int,
                        help="Use sqlite as the core processing engine for the backtesting [default: %(default)s]")
    parser.add_argument('-f', '--fitness_sort_size', default=1000, type=int,
                        help="number of loans to limit the fitness sort size; the larger the value, the longer the run and the more optimal the solution [default: %(default)s]")
    parser.add_argument('-y', '--young_loans_in_days', default=3 * 30, type=int,
                        help="filter out young loans if they are younger than the specified number of days [default: %(default)s]")
    parser.add_argument('-w', '--workers', default=enable_workers * cpu_count(), type=int,
                        help="number of workers, defaults to the number of cpu cores [default: %(default)s]")
    parser.add_argument('-b', '--work_batch', default=75, type=int,
                        help="size of the work batch to give to each worker [default: %(default)s]")

    # Process arguments
    args = parser.parse_args()

    if args.population_size < args.workers:
        args.workers = 1
        enable_workers = 0

    if enable_workers > 0 and args.zmq:
        enable_zmq = utilities.check_for_pyzmq()
    else:
        enable_zmq = False

    random.seed(args.seed)

    csv_download_files = args.data.split(',')
    csv_stats_files = args.stats.split(',')
    for i in range(len(csv_stats_files)):
        if os.path.exists(csv_stats_files[i]):
            sys.stdout.write("Using %s as the stats file\n" % csv_stats_files[i])
        else:
            if i >= len(csv_download_files):  # index check was off by one
                # sys.stderr.write takes a single string, so format the message before writing
                sys.stderr.write("This file does not exist locally: %s, but the corresponding download "
                                 "url does not exist either. Make sure the number of comma separated "
                                 "download urls matches the comma separated stats files: %s\n"
                                 % (csv_stats_files[i], csv_stats_files))
                sys.exit(-1)
            sys.stdout.write("Downloading %s as data file\n" % csv_download_files[i])
            utilities.download_data(csv_download_files[i], csv_stats_files[i])

    if enable_workers:
        if enable_zmq:
            lcbt = ZmqLCBT(ConversionFilters, args, worker_idx=-1)
            lcbt.initialize()
            for worker_idx in range(args.workers):
                Process(target=zmq_worker, args=(worker_idx, args)).start()
            ga_test = ZmqGATest(BackTestFilters, lcbt, args)
            ga_test.run()
        else:
            # only need this one to initialize the data
            lcbt = LCBT(ConversionFilters, args, worker_idx=-1)
            lcbt.initialize()
            work_queue = MultiProcessingQueue()
            response_queue = MultiProcessingQueue()
            for worker_idx in range(args.workers):
                mp_worker(worker_idx, args, work_queue, response_queue)
            ga_test = ParallelGATest(BackTestFilters, lcbt, args, work_queue, response_queue)
            ga_test.run()
    else:
        lcbt = LCBT(ConversionFilters, args, worker_idx=-1)
        lcbt.initialize()
        ga_test = GATest(BackTestFilters, lcbt, args)
        ga_test.run()

    return 0
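# Example invocations (script name is hypothetical; flags as defined above):
#   python lc_ga.py -p 256 -i 1000 -w 4    # smaller GA run on 4 workers
#   python lc_ga.py -k -r Filtered.csv     # checking mode on prior CSV results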
def main():
    clover_data = utilities.download_data('blanchard_clover_data')
    shuffled_data = utilities.download_data('blanchard_clover_shuffled')
    projected_data = utilities.download_data('blanchard_clover_result')
    plot_2d_data(clover_data, shuffled_data, projected_data)
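# Standard entry-point guard (assumed; not present in the original snippet) so
# the module can be imported without triggering the download and plot.
if __name__ == '__main__':
    main()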