def training_and_classification_with_kfold_cross_validation(
        collection_name, k, chunk_size=5000, pca_covered_variance=75,
        include_highlevel=True, metric='LC', num_sim=18, threshold=0.2):
    '''
    Training and classification of an autotagger using k-fold cross validation.

    First calls _split_metadata_and_features(collection_name, k), then, for
    every fold i in 1..k, runs three stages:

      1. DatasetCreator(collection_name).create(...) on
         train/<collection>_features__fold<i>.tsv
      2. FeatureSelector().select(...) on dbs/<collection>__fold<i>.db
      3. Autotagger().train(...) on transformed_dbs/<collection>__fold<i>.db
         with train/<collection>_metadata__fold<i>.tsv, followed by two
         classify(...) calls over test/<collection>_features__fold<i>.tsv:
         one with ranked=False writing
         test/<collection>_output_binary__fold<i>.tsv and one with
         ranked=True writing test/<collection>_output_affinity__fold<i>.tsv.

    Parameters:
      collection_name       name used to build every per-fold file path.
      k                     number of folds; folds are numbered from 1 to k.
      chunk_size            forwarded to DatasetCreator.create (default 5000).
      pca_covered_variance  forwarded to FeatureSelector.select (default 75).
      include_highlevel     forwarded to FeatureSelector.select (default True).
      metric                forwarded to Autotagger.classify (default 'LC').
      num_sim               forwarded to Autotagger.classify (default 18).
      threshold             forwarded to Autotagger.classify (default 0.2).

    The defaults reproduce the values that used to be hard-coded, so existing
    two-argument calls behave exactly as before.

    NOTE(review): the per-fold train/ and test/ files are presumably written
    by _split_metadata_and_features -- confirm against that helper.
    '''
    _split_metadata_and_features(collection_name, k)

    for fold in range(1, k + 1):
        # Every path below is parameterised by the same (collection, fold) pair.
        fold_id = (collection_name, fold)

        # Create a gaia dataset with the training set
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and also parses on Python 3.
        print("----------------------- DATASET CREATION (FOLD %d)-----------------------" % fold)
        training_features = 'train/%s_features__fold%d.tsv' % fold_id
        dataset_suffix = "fold%d" % fold
        replace_dataset = True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)

        # Feature selection over the gaia dataset
        print("----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % fold)
        source_dataset = 'dbs/%s__fold%d.db' % fold_id
        feature_selector = FeatureSelector()
        feature_selector.select(source_dataset, pca_covered_variance, include_highlevel)

        # Autotag a given test set
        print("----------------------- AUTOTAGGING (FOLD %d)-----------------------" % fold)
        transformed_dataset = 'transformed_dbs/%s__fold%d.db' % fold_id
        training_metadata = 'train/%s_metadata__fold%d.tsv' % fold_id
        test_features = 'test/%s_features__fold%d.tsv' % fold_id
        output_binary = 'test/%s_output_binary__fold%d.tsv' % fold_id
        output_affinity = 'test/%s_output_affinity__fold%d.tsv' % fold_id

        autotagger = Autotagger()
        autotagger.train(transformed_dataset, training_metadata)
        # The same trained tagger classifies the test set twice: once with
        # ranked=False (binary output file) and once with ranked=True
        # (affinity output file).
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
# Resolve the three input paths in a fixed order (dataset, training metadata,
# test features). An option left as None falls back to a path derived from the
# collection name; the resolved path must exist or the script exits with -1.
# The default is only built when it is actually needed.
for _option, _prefix, _suffix, _not_found in (
        ("dataset", "transformed_dbs/", ".db",
         "Dataset '%s' not found"),
        ("training_metadata", "train/", "_metadata.tsv",
         "Training metadata file '%s' not found"),
        ("test_features", "test/", "_features.tsv",
         "Test features file '%s' not found"),
):
    if getattr(args, _option) is None:
        setattr(args, _option, _prefix + args.collection_name + _suffix)
    if not os.path.exists(getattr(args, _option)):
        print(_not_found % getattr(args, _option))
        sys.exit(-1)

# The output file is produced by the run, so it only gets a default path and
# no existence check.
if args.output is None:
    args.output = "test/" + args.collection_name + "_output.tsv"

# Show the fully resolved arguments before starting.
print(args)

# Train on the dataset and its metadata, then classify the test features.
autotagger = Autotagger()
autotagger.train(args.dataset, args.training_metadata)
autotagger.classify(
    args.test_features,
    args.output,
    args.metric,
    args.num_sim,
    args.threshold,
    args.ranked_tags,
)