def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Assemble the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="Loads a trained model and outputs predictions based "
                    "on input feature files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    arg_parser.add_argument('model_file',
                            help='Model file to load and use for generating '
                                 'predictions.')
    arg_parser.add_argument('input_file',
                            help='A csv file, json file, or megam file (with '
                                 'or without the label column), with the '
                                 'appropriate suffix.',
                            nargs='+')
    arg_parser.add_argument('-l', '--label_col',
                            help='Name of the column which contains the class '
                                 'labels in ARFF, CSV, or TSV files. For ARFF '
                                 'files, this must be the final column to '
                                 'count as the label.',
                            default='y')
    arg_parser.add_argument('-p', '--positive_class',
                            help="If the model is only being used to predict "
                                 "the probability of a particular class, this "
                                 "specifies the index of the class we're "
                                 "predicting. 1 = second class, which is "
                                 "default for binary classification. Keep in "
                                 "mind that classes are sorted "
                                 "lexicographically.",
                            default=1, type=int)
    arg_parser.add_argument('-q', '--quiet',
                            help='Suppress printing of "Loading..." messages.',
                            action='store_true')
    arg_parser.add_argument('-t', '--threshold',
                            help="If the model we're using is generating "
                                 "probabilities of the positive class, return "
                                 "1 if it meets/exceeds the given threshold "
                                 "and 0 otherwise.",
                            type=float)
    arg_parser.add_argument('--version', action='version',
                            version='%(prog)s {0}'.format(__version__))
    args = arg_parser.parse_args(argv)

    # Route warnings from the built-in warnings module through logging so
    # they share the formatter configured below.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    # Build the predictor once and reuse it for every input file.
    predictor = Predictor(args.model_file,
                          positive_class=args.positive_class,
                          threshold=args.threshold)

    for input_file in args.input_file:
        data = load_examples(input_file, quiet=args.quiet,
                             label_col=args.label_col)
        for pred in predictor.predict(data):
            print(pred)
def main():
    '''
    Create directories and split CSV files into subsets.

    Expects ``train.csv`` and ``test.csv`` (the Kaggle Titanic data) to be
    in the current working directory; exits with status 1 if they are not.
    Writes split feature files under ``titanic/train``, ``titanic/dev``,
    ``titanic/train+dev``, and ``titanic/test``.
    '''
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    if not (os.path.exists('train.csv') and os.path.exists('test.csv')):
        logger.error('This script requires the train.csv and test.csv files ' +
                     'from http://www.kaggle.com/c/titanic-gettingStarted/' +
                     'data to be in the current directory in order to work. ' +
                     'Please download them and try again.')
        sys.exit(1)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {'vitals': ['Sex', 'Age'],
                   'socioeconomic': ['Pclass', 'Fare'],
                   'family': ['SibSp', 'Parch'],
                   'misc': ['Embarked']}

    # Create directories to store files (one loop instead of four
    # copy-pasted blocks; log messages are unchanged).
    for dir_path in ('titanic/train', 'titanic/dev', 'titanic/train+dev',
                     'titanic/test'):
        if not os.path.exists(dir_path):
            logger.info('Creating %s directory', dir_path)
            os.makedirs(dir_path)

    # Read and write training examples
    train_examples = load_examples('train.csv', label_col='Survived',
                                   quiet=False, sparse=False)
    num_train_dev = len(train_examples.classes)
    # 80/20 train/dev split. Integer arithmetic replaces the old
    # int((n / 5) * 4) float round-trip; results are identical.
    num_train = num_train_dev * 4 // 5
    train_ids = list(range(1, num_train_dev + 1))
    write_feature_file('titanic/train/.csv',
                       train_ids[:num_train],
                       train_examples.classes[:num_train],
                       train_examples.features[:num_train, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict,
                       label_col='Survived',
                       id_prefix='train_example')

    # Write train+dev set for training the model that will be used to
    # generate predictions on the test set.
    write_feature_file('titanic/train+dev/.csv',
                       train_ids,
                       train_examples.classes,
                       train_examples.features,
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict,
                       label_col='Survived',
                       id_prefix='train_example')

    # Write dev examples (the last 20% of the training data)
    write_feature_file('titanic/dev/.csv',
                       train_ids[num_train:],
                       train_examples.classes[num_train:],
                       train_examples.features[num_train:, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict,
                       label_col='Survived',
                       id_prefix='dev_example')

    # Read and write test examples. Test IDs continue numbering after the
    # last train+dev ID so IDs are unique across all files.
    test_examples = load_examples('test.csv', label_col='Survived',
                                  quiet=False, sparse=False)
    num_test = len(test_examples.classes)
    test_ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    write_feature_file('titanic/test/.csv',
                       test_ids,
                       test_examples.classes,
                       test_examples.features,
                       feat_vectorizer=test_examples.feat_vectorizer,
                       subsets=subset_dict,
                       label_col='Survived',
                       id_prefix='test_example')