Code example #1
File: ngca_algorithm.py Project: shanys8/ngca
def score_ngca_on_clover_data_by_svm(alpha1, alpha2, beta1, beta2):

    # get samples and labels from train data (validation handling is commented out below)
    train_shuffled_data = utilities.download_data('cloverDataShuffledTrn')
    train_data = utilities.download_data('cloverDataTrn')
    kmeans_train_data = KMeans(n_clusters=4, random_state=0).fit(train_data)  # get labels for 4 clusters
    train_labels = kmeans_train_data.labels_

    # validation_shuffled_data = utilities.download_data('cloverDataShuffledVdn')
    # validation_data = utilities.download_data('cloverDataVdn')
    # kmeans_validation_data = KMeans(n_clusters=4, random_state=0).fit(validation_data)  # Get 4 clusters labels
    # validation_labels = kmeans_validation_data.labels_

    # Run algorithm on samples from train data
    train_samples, train_samples_copy = utilities.download_data('cloverDataShuffledTrn', separate_data=True)
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)

    # if the subspace is not 2-dimensional, return the worst possible score
    if approx_ng_subspace.shape[1] != 2:
        print('subspace dimension is not 2')
        return 1

    # Project train data on the result subspace
    proj_train_shuffled_data = np.dot(train_shuffled_data, approx_ng_subspace)

    # build SVM classifier - fit on train data and score predictions on the same train projection
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_shuffled_data, train_labels)
    predicted_train_labels = clf.predict(proj_train_shuffled_data)

    # assign score
    score = clf.score(proj_train_shuffled_data, train_labels)  # score by SVM model
    # score = adjusted_rand_score(train_labels, predicted_train_labels)
    return 1 - score  # we want to minimize score
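
Since the scorer returns 1 - accuracy so that lower is better, it plugs directly into a parameter search. A minimal usage sketch, not from the original project, with assumed candidate grid values:

import itertools

def grid_search_clover_params(candidates=(0.2, 0.5, 0.8)):
    # Try every (alpha1, alpha2, beta1, beta2) combination and keep the
    # one with the lowest loss returned by the scorer above.
    best_loss, best_params = float('inf'), None
    for params in itertools.product(candidates, repeat=4):
        loss = score_ngca_on_clover_data_by_svm(*params)
        if loss < best_loss:
            best_loss, best_params = loss, params
    return best_params, best_loss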
Code example #2
def evaluate_test_data_by_svm(algorithm_params):

    # get samples from train and test data
    test_samples, test_samples_copy = utilities.download_data('DataTst', separate_data=True)
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)

    # get samples and labels from train and test data
    train_data = utilities.download_data('DataTrn')
    train_labels = utilities.download_labels('DataTrn')
    test_data = utilities.download_data('DataTst')
    test_labels = utilities.download_labels('DataTst')

    # Run algorithm on samples from train and test data
    approx_train_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, algorithm_params)
    approx_test_ng_subspace = run_ngca_algorithm(test_samples, test_samples_copy, algorithm_params)

    # Project train data on the result subspace
    proj_train_data = np.dot(train_data, approx_train_ng_subspace)
    proj_test_data = np.dot(test_data, approx_test_ng_subspace)

    # build SVM classifier - fit by train data and check prediction (score) on test data
    # clf = SVC(gamma='auto')
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_data, train_labels)

    # assign score
    score = clf.score(proj_test_data, test_labels)
    print('Score on test data:')
    utilities.print_score(score)

    plot_2d_data(proj_test_data, test_labels, algorithm_params)
    plot_3d_data(proj_test_data, test_labels, algorithm_params)

    return
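
The train and test subspaces above are learned independently, so the two projections need not share a basis. A variant sketch, assumed rather than taken from the project, that reuses the train subspace for both projections:

def evaluate_test_data_with_shared_subspace(algorithm_params):
    # Learn the NG subspace on the training samples only, then project
    # both splits with it so the SVM sees one consistent coordinate system.
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)
    subspace = run_ngca_algorithm(train_samples, train_samples_copy, algorithm_params)

    train_data = utilities.download_data('DataTrn')
    train_labels = utilities.download_labels('DataTrn')
    test_data = utilities.download_data('DataTst')
    test_labels = utilities.download_labels('DataTst')

    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(np.dot(train_data, subspace), train_labels)
    return clf.score(np.dot(test_data, subspace), test_labels)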
Code example #3
def evaluate_test_data_by_kmeans(algorithm_params):

    # get samples from test data
    test_samples, test_samples_copy = utilities.download_data(
        'DataTst', separate_data=True)

    # get samples and labels from test data
    test_data = utilities.download_data('DataTst')
    test_labels = utilities.download_labels('DataTst')

    # Run algorithm on samples from test data
    approx_ng_subspace = run_ngca_algorithm(test_samples, test_samples_copy,
                                            algorithm_params)

    # Project test data on the result subspace
    proj_data = np.dot(test_data, approx_ng_subspace)

    # evaluate data clustering by algorithm
    score = utilities.get_result_score_by_kmeans(proj_data, test_labels, 3)

    print('Score on test data:')
    utilities.print_score_fixed(score)

    # plot data in 2D & 3D
    plot_2d_data(proj_data, algorithm_params, test_labels)
    plot_3d_data(proj_data, algorithm_params, test_labels)

    return
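
utilities.get_result_score_by_kmeans is not shown in these excerpts. A plausible stand-in, a hypothetical reconstruction rather than the project's code, clusters the projection and scores agreement with the reference labels using a permutation-invariant metric:

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

def get_result_score_by_kmeans(proj_data, labels, n_clusters):
    # Cluster the projected data, then compare to the known labels;
    # the adjusted Rand index ignores arbitrary cluster numbering.
    predicted = KMeans(n_clusters=n_clusters, random_state=0).fit(proj_data).labels_
    return adjusted_rand_score(labels, predicted)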
Code example #4
def scoring_by_kmeans():
    # get samples from train data
    train_samples, train_samples_copy = utilities.download_data(
        'DataTrn', separate_data=True)

    # get samples and labels from validation data
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    algorithm_params = {
        'alpha1': 0.7,
        'alpha2': 0.3,
        'beta1': 0.34,
        'beta2': 0.64,
    }

    # Run algorithm on samples from train data
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy,
                                            algorithm_params)

    # Project validation data on the result subspace
    proj_data = np.dot(validation_data, approx_ng_subspace)

    # evaluate data clustering by algorithm
    score = utilities.get_result_score_by_kmeans(proj_data, validation_labels,
                                                 3)

    # score result
    utilities.print_score_fixed(score)

    # plot data in 2D & 3D
    plot_2d_data(proj_data, algorithm_params, validation_labels)
    plot_3d_data(proj_data, algorithm_params, validation_labels)

    return score
Code example #5
File: ngca_algorithm.py Project: shanys8/ngca
def score_ngca_on_oil_data_by_svm(alpha1, alpha2, beta1, beta2):

    # get samples and labels from train and validation data
    train_data = utilities.download_data('DataTrn')
    train_labels = utilities.download_labels('DataTrn')
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    # Run algorithm on samples from train data
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)
    approx_train_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)
    print('reduced 12 dimensions to {} dimensions'.format(approx_train_ng_subspace.shape[1]))

    # if the subspace dimension is not between 3 and 6, return the worst score (invalid dimensions)
    if approx_train_ng_subspace.shape[1] < 3 or approx_train_ng_subspace.shape[1] > 6:
        print('Train NG subspace dimension should be between 3 and 6')
        return 1

    # Project train data on the result subspace to extract the NG components
    proj_train_data = np.dot(train_data, approx_train_ng_subspace)

    # Run algorithm on samples from validation data
    validation_samples, validation_samples_copy = utilities.download_data('DataVdn', separate_data=True)
    approx_validation_ng_subspace = run_ngca_algorithm(validation_samples, validation_samples_copy, alpha1, alpha2, beta1, beta2)
    print('reduced 12 dimensions to {} dimensions'.format(approx_validation_ng_subspace.shape[1]))

    # if the subspace dimension is not between 3 and 6, return the worst score (invalid dimensions)
    if approx_validation_ng_subspace.shape[1] < 3 or approx_validation_ng_subspace.shape[1] > 6:
        print('Validation NG subspace dimension should be between 3 and 6')
        return 1

    if approx_train_ng_subspace.shape[1] != approx_validation_ng_subspace.shape[1]:
        print('Validation and Train NG subspace dimensions are different')
        return 1

    # Project validation data on the result subspace to extract the NG components
    proj_validation_data = np.dot(validation_data, approx_validation_ng_subspace)

    # build SVM classifier - fit by train data and check prediction of validation data
    # clf = SVC(gamma='auto')
    clf = SVC(kernel='rbf', C=500, gamma=0.1)
    clf.fit(proj_train_data, train_labels)

    # assign score
    score = clf.score(proj_validation_data, validation_labels)  # score by SVM model
    train_score = clf.score(proj_train_data, train_labels)  # score by SVM model
    print('train score: {}'.format(train_score))
    return 1 - score  # we want to minimize score
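
Even when the train and validation subspaces have the same dimension, they can still span different subspaces. A hedged sketch for quantifying that with principal angles (the tolerance value is an assumption):

import numpy as np
from scipy.linalg import subspace_angles

def subspaces_agree(train_subspace, validation_subspace, tol_radians=0.3):
    # Columns of each matrix span the learned subspace; the largest
    # principal angle is 0 when the two subspaces coincide.
    angles = subspace_angles(train_subspace, validation_subspace)
    return np.max(angles) < tol_radians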
Code example #6
def evaluate_test_data_by_kmeans(algorithm_params):
    samples, samples_copy = utilities.download_data(
        'blanchard_clover_shuffled_full', separate_data=True)
    shuffled_data_full = utilities.download_data(
        'blanchard_clover_shuffled_full')
    clover_data = utilities.download('blanchard_clover_data')

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy,
                                            algorithm_params)

    # Project shuffled_data on the result subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # plot data in 2D
    plot_2d_data(clover_data, shuffled_data_full, proj_data)

    return
Code example #7
def evaluate_test_data_by_svm(algorithm_params):
    # samples, samples_copy = utilities.download_data('cloverDataShuffledTst', separate_data=True)
    samples, samples_copy = utilities.download_data('cloverDataShuffledTrn', separate_data=True)
    # shuffled_data_full = utilities.download_data('cloverDataShuffledTst')
    shuffled_data_full = utilities.download_data('cloverDataShuffledTrn')
    # clover_data = utilities.download('cloverDataTst')
    clover_data = utilities.download('cloverDataTrn')

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy, algorithm_params)

    # Project shuffled_data on the result subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # plot data in 2D
    plot_2d_data(clover_data, shuffled_data_full, proj_data)

    return
Code example #8
File: ngca_algorithm.py Project: shanys8/ngca
def score_ngca_on_oil_data_by_kmeans(alpha1, alpha2, beta1, beta2):

    # get samples from train data
    train_samples, train_samples_copy = utilities.download_data('DataTrn', separate_data=True)

    # get samples and labels from validation data
    validation_data = utilities.download_data('DataVdn')
    validation_labels = utilities.download_labels('DataVdn')

    # run NGCA on train data
    approx_ng_subspace = run_ngca_algorithm(train_samples, train_samples_copy, alpha1, alpha2, beta1, beta2)

    # Project validation data on the result subspace
    proj_data = np.dot(validation_data, approx_ng_subspace)

    # evaluate data clustering by algorithm
    score = utilities.get_result_score_by_kmeans(proj_data, validation_labels, 3)

    return score
Code example #9
File: ngca_algorithm.py Project: shanys8/ngca
def score_ngca_algorithm_on_clover_data_by_kmeans(alpha1, alpha2, beta1, beta2):

    samples, samples_copy = utilities.download_data('blanchard_clover_shuffled_full', separate_data=True)
    shuffled_data_full = utilities.download_data('blanchard_clover_shuffled_full')
    clover_data = utilities.download('blanchard_clover_data')

    kmeans_clover = KMeans(n_clusters=4, random_state=0).fit(clover_data)  # get labels for 4 clusters
    clover_kmeans_labels = kmeans_clover.labels_

    # run NGCA on shuffled data
    approx_ng_subspace = run_ngca_algorithm(samples, samples_copy, alpha1, alpha2, beta1, beta2)

    # Project shuffled_data on the result subspace
    proj_data = np.dot(shuffled_data_full, approx_ng_subspace)

    # evaluate result data by KMEANS
    kmeans_clover = KMeans(n_clusters=4, random_state=0).fit(proj_data)  # get labels for 4 clusters
    predicted_result_labels = kmeans_clover.labels_

    score = utilities.score_labels(clover_kmeans_labels, predicted_result_labels)

    return score
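
utilities.score_labels compares two KMeans labelings, which are only defined up to a permutation of cluster ids. One hypothetical sketch of such a comparison, not the project's implementation, uses Hungarian matching on the confusion matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def score_labels(reference_labels, predicted_labels):
    # Find the cluster-id permutation that maximizes agreement between
    # the two labelings, then report the matched accuracy.
    cm = confusion_matrix(reference_labels, predicted_labels)
    row_ind, col_ind = linear_sum_assignment(-cm)
    return cm[row_ind, col_ind].sum() / cm.sum()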
Code example #10
File: lcbt.py Project: finallybiubiu/LendingClub
def main(argv=None):  # IGNORE:C0111
    """Command line options."""

    if argv is not None:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s - %s

  Created by user_name on %s.
  Copyright 2013 Freedom. All rights reserved.
  
  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0
  
  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_name, program_shortdesc, str(__date__))

    # ----------------------------------------------------------------------------------------------------------------------
    enable_workers = (cpu_count() > 1)
    # ----------------------------------------------------------------------------------------------------------------------

    # Setup argument parser
    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument('-V', '--version', action='version', version=program_version_message)
    parser.add_argument('-g', '--grades', default='A,B,C,D,E,F,G',
                        help="Comma separated list of credit grades to test [default: %(default)s]")
    parser.add_argument('-a', '--states', default='CA,AZ,FL,GA,IL,MD,NV,TX,NY',
                        help="Comma separated list of states to test [default: %(default)s]")
    parser.add_argument('-t', '--terms', default='36,60',
                        help="Comma separated list of loan terms to test [default: %(default)s]")
    parser.add_argument('-s', '--seed', default=100, help="Random Number Generator Seed [default: %(default)s]")
    parser.add_argument('-d', '--data',
                        default="https://resources.lendingclub.com/LoanStats3a.csv.zip,https://resources.lendingclub.com/LoanStats3b.csv.zip,https://resources.lendingclub.com/LoanStats3c.csv.zip",
                        help="Comma separated download paths for the notes data files [default: %(default)s]")
    parser.add_argument('-l', '--stats', default="LoanStats3a.csv.zip,LoanStats3b.csv.zip,LoanStats3c.csv.zip",
                        help="Comma separated list of input Loan Stats CSV files [default: %(default)s]")
    parser.add_argument('-c', '--csvresults', default="lc_best.csv",
                        help="Output best results CSV file [default: %(default)s]")
    parser.add_argument('-p', '--population_size', default=512, type=int, help="population size [default: %(default)s]")
    parser.add_argument('-i', '--iterations', default=4096, type=int,
                        help="how many Genetic Algorithm iterations to perform [default: %(default)s]")
    parser.add_argument('-e', '--elite_rate', default=0.05, type=float, help="elite rate [default: %(default)s]")
    parser.add_argument('-m', '--mutation_rate', default=0.05, type=float, help="mutation rate [default: %(default)s]")
    parser.add_argument('-k', '--check', default=False, action='store_true',
                        help="checking mode: open the CSV results file and filter the loans into a separate file [default: %(default)s]")
    parser.add_argument('-r', '--checkresults', default="LoanStatsNewFiltered.csv",
                        help="file name for the filtered results used during checking mode [default: %(default)s]")
    parser.add_argument('-z', '--zmq', default=True, action='store_true',
                        help="Use zmq libraries for multi-processing [default: %(default)s]")
    parser.add_argument('-q', '--sqlite', default=1, type=int,
                        help="Use sqlite as the core processing engine for the backtesting [default: %(default)s]")
    parser.add_argument('-f', '--fitness_sort_size', default=1000, type=int,
                        help="number of loans to limit the fitness sort size, the larger the longer and more optimal solution [default: %(default)s]")
    parser.add_argument('-y', '--young_loans_in_days', default=3 * 30, type=int,
                        help="filter young loans if they are younger than specified number of days [default: %(default)s]")
    parser.add_argument('-w', '--workers', default=enable_workers * cpu_count(), type=int,
                        help="number of workers defaults to the number of cpu cores [default: %(default)s]")
    parser.add_argument('-b', '--work_batch', default=75, type=int,
                        help="size of work batch size to give to each worker [default: %(default)s]")

    # Process arguments
    args = parser.parse_args()

    if args.population_size < args.workers:
        args.workers = 1
        enable_workers = 0

    if enable_workers > 0 and args.zmq:
        enable_zmq = utilities.check_for_pyzmq()
    else:
        enable_zmq = False

    random.seed(args.seed)

    csv_download_files = args.data.split(',')
    csv_stats_files = args.stats.split(',')

    for i in range(len(csv_stats_files)):
        if os.path.exists(csv_stats_files[i]):
            sys.stdout.write("Using %s as the stats file\n" % csv_stats_files[i])
        else:
            if i >= len(csv_download_files):
                sys.stderr.write("This file does not exist locally: %s, "
                                 "but the corresponding download url does not exist either. "
                                 "Make sure the number of comma separated download urls "
                                 "matches the comma separated stats files: %s\n"
                                 % (csv_stats_files[i], csv_stats_files))
                sys.exit(-1)
            sys.stdout.write("Downloading %s as data file\n" % csv_download_files[i])
            utilities.download_data(csv_download_files[i], csv_stats_files[i])

    if enable_workers:
        if enable_zmq:
            lcbt = ZmqLCBT(ConversionFilters, args, worker_idx=-1)
            lcbt.initialize()
            for worker_idx in range(args.workers):
                Process(target=zmq_worker, args=(worker_idx, args)).start()
            ga_test = ZmqGATest(BackTestFilters, lcbt, args)
            ga_test.run()
        else:
            # only need this one to initialize the data
            lcbt = LCBT(ConversionFilters, args, worker_idx=-1)
            lcbt.initialize()
            work_queue = MultiProcessingQueue()
            response_queue = MultiProcessingQueue()
            for worker_idx in range(args.workers):
                mp_worker(worker_idx, args, work_queue, response_queue)
            ga_test = ParallelGATest(BackTestFilters, lcbt, args, work_queue, response_queue)
            ga_test.run()
    else:
        lcbt = LCBT(ConversionFilters, args, worker_idx=-1)
        lcbt.initialize()
        ga_test = GATest(BackTestFilters, lcbt, args)
        ga_test.run()

    return 0
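
Because main extends sys.argv with its argv parameter, extra options can be injected programmatically. A hypothetical invocation sketch (the option values are assumptions):

if __name__ == '__main__':
    # Run a short backtest over two grades with a reduced iteration count.
    sys.exit(main(['--grades', 'A,B', '--iterations', '128']))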
Code example #11
def main():

    clover_data = utilities.download_data('blanchard_clover_data')
    shuffled_data = utilities.download_data('blanchard_clover_shuffled')
    projected_data = utilities.download_data('blanchard_clover_result')
    plot_2d_data(clover_data, shuffled_data, projected_data)